davanstrien (HF staff) committed
Commit aec7db1
1 Parent(s): 1ebedb6

Update dependencies and add Dockerfile

Files changed (5)
  1. Dockerfile +22 -0
  2. README.md +2 -4
  3. app.py → main.py +35 -41
  4. requirements.in +2 -3
  5. requirements.txt +27 -178
Dockerfile ADDED
@@ -0,0 +1,22 @@
+FROM python:3.11
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+
+# Switch to the "user" user
+USER user
+
+# Set home to the user's home directory
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+
+# Set the working directory to the user's home directory
+WORKDIR $HOME/code
+WORKDIR /code
+
+COPY ./requirements.txt /code/requirements.txt
+
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+COPY . .
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
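
The CMD above serves the FastAPI app on port 7860, the default port for Docker Spaces. For local testing without Docker, a minimal sketch (not part of the commit) that mirrors it, assuming main.py sits in the current working directory:

# run_local.py - illustrative only; mirrors the Dockerfile CMD
import uvicorn

if __name__ == "__main__":
    # Serve the FastAPI app object defined in main.py on the same host/port as the container
    uvicorn.run("main:app", host="0.0.0.0", port=7860)
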
README.md CHANGED
@@ -1,11 +1,9 @@
 ---
 title: Dataset Language Detection
-emoji: 🦀
+emoji: 🌐
 colorFrom: purple
 colorTo: yellow
-sdk: gradio
-sdk_version: 4.15.0
-app_file: app.py
+sdk: docker
 pinned: false
 license: mit
 ---
app.py → main.py RENAMED
@@ -1,16 +1,18 @@
 import os
 import random
+from pathlib import Path
 from statistics import mean
-from typing import Iterator, Union
+from typing import Any, Iterator, Union

 import fasttext
-import gradio as gr
 from dotenv import load_dotenv
-from httpx import Client, Timeout
+from fastapi import FastAPI
+from httpx import AsyncClient, Client, Timeout
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import logging
 from toolz import concat, groupby, valmap

+app = FastAPI()
 logger = logging.get_logger(__name__)
 load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN")
@@ -23,7 +25,7 @@ headers = {
 }
 timeout = Timeout(60, read=120)
 client = Client(headers=headers, timeout=timeout)
-# async_client = AsyncClient(headers=headers, timeout=timeout)
+async_client = AsyncClient(headers=headers, timeout=timeout)
 # non exhaustive list of columns that might contain text which can be used for language detection
 # we prefer to use columns in this order i.e. if there is a column named "text" we will use it first
 TARGET_COLUMN_NAMES = {
@@ -54,9 +56,9 @@ def datasets_server_valid_rows(hub_id: str):
         return False


-def get_first_config_and_split_name(hub_id: str):
+async def get_first_config_and_split_name(hub_id: str):
     try:
-        resp = client.get(
+        resp = await async_client.get(
             f"https://datasets-server.huggingface.co/splits?dataset={hub_id}"
         )

@@ -67,21 +69,21 @@ def get_first_config_and_split_name(hub_id: str):
         return None


-def get_dataset_info(hub_id: str, config: str | None = None):
+async def get_dataset_info(hub_id: str, config: str | None = None):
     if config is None:
         config = get_first_config_and_split_name(hub_id)
         if config is None:
             return None
         else:
             config = config[0]
-    resp = client.get(
+    resp = await async_client.get(
         f"{BASE_DATASETS_SERVER_URL}/info?dataset={hub_id}&config={config}"
     )
     resp.raise_for_status()
     return resp.json()


-def get_random_rows(
+async def get_random_rows(
     hub_id: str,
     total_length: int,
     number_of_rows: int,
@@ -99,7 +101,7 @@ def get_random_rows(
         url = f"https://datasets-server.huggingface.co/rows?dataset={hub_id}&config={config}&split={split}&offset={offset}&length={rows_per_call}"
         logger.info(f"Fetching {url}")
         print(url)
-        response = client.get(url)
+        response = await async_client.get(url)
         if response.status_code == 200:
             data = response.json()
             batch_rows = data.get("rows")
@@ -139,9 +141,15 @@ def yield_clean_rows(rows: Union[list[str], str], min_length: int = 3) -> Iterat
 FASTTEXT_PREFIX_LENGTH = 9  # fasttext labels are formatted like "__label__eng_Latn"

 # model = load_model(DEFAULT_FAST_TEXT_MODEL)
-
+Path("code/models").mkdir(parents=True, exist_ok=True)
 model = fasttext.load_model(
-    hf_hub_download("facebook/fasttext-language-identification", "model.bin")
+    hf_hub_download(
+        "facebook/fasttext-language-identification",
+        "model.bin",
+        cache_dir="code/models",
+        local_dir="code/models",
+        local_dir_use_symlinks=False,
+    )
 )


@@ -187,36 +195,43 @@ def predict_rows(rows, target_column, language_threshold_percent=0.2):
     }


-def predict_language(
+@app.get("/items/{hub_id}")
+async def predict_language(
     hub_id: str,
     config: str | None = None,
     split: str | None = None,
     max_request_calls: int = 10,
     number_of_rows: int = 1000,
-) -> dict[str, float | str]:
+) -> dict[Any, Any] | None:
     is_valid = datasets_server_valid_rows(hub_id)
     if not is_valid:
-        gr.Error(f"Dataset {hub_id} is not accessible via the datasets server.")
+        logger.error(f"Dataset {hub_id} is not accessible via the datasets server.")
+    if not config and not split:
+        config, split = await get_first_config_and_split_name(hub_id)
     if not config:
-        config, split = get_first_config_and_split_name(hub_id)
-    info = get_dataset_info(hub_id, config)
+        config, _ = await get_first_config_and_split_name(hub_id)
+    if not split:
+        _, split = await get_first_config_and_split_name(hub_id)
+    info = await get_dataset_info(hub_id, config)
     if info is None:
-        gr.Error(f"Dataset {hub_id} is not accessible via the datasets server.")
+        logger.error(f"Dataset {hub_id} is not accessible via the datasets server.")
+        return None
     if dataset_info := info.get("dataset_info"):
         total_rows_for_split = dataset_info.get("splits").get(split).get("num_examples")
         features = dataset_info.get("features")
         column_names = set(features.keys())
         logger.info(f"Column names: {column_names}")
         if not set(column_names).intersection(TARGET_COLUMN_NAMES):
-            raise gr.Error(
+            logger.error(
                 f"Dataset {hub_id} {column_names} is not in any of the target columns {TARGET_COLUMN_NAMES}"
             )
+            return None
         for column in TARGET_COLUMN_NAMES:
             if column in column_names:
                 target_column = column
                 logger.info(f"Using column {target_column} for language detection")
                 break
-        random_rows = get_random_rows(
+        random_rows = await get_random_rows(
             hub_id,
             total_rows_for_split,
             number_of_rows,
@@ -230,24 +245,3 @@ def predict_language(
         predictions["config"] = config
         predictions["split"] = split
         return predictions
-
-
-app_title = "Dataset Language Detection"
-app_description = "Detect the language of a dataset on the Hub"
-inputs = [
-    gr.Text(label="dataset id"),
-    gr.Textbox(
-        None,
-        label="config",
-    ),
-    gr.Textbox(None, label="split"),
-]
-interface = gr.Interface(
-    predict_language,
-    inputs=inputs,
-    outputs="json",
-    title=app_title,
-    article=app_description,
-)
-interface.queue()
-interface.launch()
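
With the Gradio interface removed, predict_language is now served as a FastAPI route at /items/{hub_id}; config, split, max_request_calls and number_of_rows become optional query parameters. A minimal client sketch (not part of the commit), assuming the app is running locally on port 7860 and using "imdb" purely as an example dataset id:

# query_example.py - illustrative only
import httpx

resp = httpx.get(
    "http://localhost:7860/items/imdb",
    params={"number_of_rows": 1000},
    timeout=120,
)
resp.raise_for_status()
predictions = resp.json()
# The endpoint echoes back the resolved config and split alongside the language predictions
print(predictions.get("config"), predictions.get("split"))
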
requirements.in CHANGED
@@ -3,7 +3,6 @@ httpx
 huggingface_hub
 rich
 toolz
-gradio
 python-dotenv
-datasets
-iso639-lang
+uvicorn[standard]
+fastapi
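
requirements.txt below is the pip-compile lock of this file, as its header notes. A minimal sketch (not part of the commit) of regenerating it after editing requirements.in, assuming pip-tools is installed:

# regenerate_lock.py - illustrative only
import subprocess

# pip-compile resolves requirements.in into a pinned requirements.txt
subprocess.run(["pip-compile", "requirements.in"], check=True)
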
requirements.txt CHANGED
@@ -4,27 +4,13 @@
 #
 # pip-compile
 #
-aiofiles==23.2.1
-    # via gradio
-aiohttp==3.9.1
-    # via
-    #   datasets
-    #   fsspec
-aiosignal==1.3.1
-    # via aiohttp
-altair==5.2.0
-    # via gradio
 annotated-types==0.6.0
     # via pydantic
 anyio==4.2.0
     # via
     #   httpx
     #   starlette
-attrs==23.2.0
-    # via
-    #   aiohttp
-    #   jsonschema
-    #   referencing
+    #   watchfiles
 certifi==2023.11.17
     # via
     #   httpcore
@@ -33,223 +19,86 @@ certifi==2023.11.17
 charset-normalizer==3.3.2
     # via requests
 click==8.1.7
-    # via
-    #   typer
-    #   uvicorn
-colorama==0.4.6
-    # via typer
-contourpy==1.2.0
-    # via matplotlib
-cycler==0.12.1
-    # via matplotlib
-datasets==2.14.4
-    # via -r requirements.in
-dill==0.3.7
-    # via
-    #   datasets
-    #   multiprocess
+    # via uvicorn
 fastapi==0.109.0
-    # via gradio
+    # via -r requirements.in
 fasttext==0.9.2
     # via -r requirements.in
-ffmpy==0.3.1
-    # via gradio
 filelock==3.13.1
     # via huggingface-hub
-fonttools==4.47.2
-    # via matplotlib
-frozenlist==1.4.1
-    # via
-    #   aiohttp
-    #   aiosignal
-fsspec[http]==2023.12.2
-    # via
-    #   datasets
-    #   gradio-client
-    #   huggingface-hub
-gradio==4.15.0
-    # via -r requirements.in
-gradio-client==0.8.1
-    # via gradio
+fsspec==2023.12.2
+    # via huggingface-hub
 h11==0.14.0
     # via
     #   httpcore
     #   uvicorn
 httpcore==1.0.2
     # via httpx
+httptools==0.6.1
+    # via uvicorn
 httpx==0.26.0
-    # via
-    #   -r requirements.in
-    #   gradio
-    #   gradio-client
+    # via -r requirements.in
 huggingface-hub==0.20.3
-    # via
-    #   -r requirements.in
-    #   datasets
-    #   gradio
-    #   gradio-client
+    # via -r requirements.in
 idna==3.6
     # via
     #   anyio
     #   httpx
     #   requests
-    #   yarl
-importlib-resources==6.1.1
-    # via gradio
-iso639-lang==2.2.2
-    # via -r requirements.in
-jinja2==3.1.3
-    # via
-    #   altair
-    #   gradio
-jsonschema==4.21.1
-    # via altair
-jsonschema-specifications==2023.12.1
-    # via jsonschema
-kiwisolver==1.4.5
-    # via matplotlib
 markdown-it-py==3.0.0
     # via rich
-markupsafe==2.1.4
-    # via
-    #   gradio
-    #   jinja2
-matplotlib==3.8.2
-    # via gradio
 mdurl==0.1.2
     # via markdown-it-py
-multidict==6.0.4
-    # via
-    #   aiohttp
-    #   yarl
-multiprocess==0.70.15
-    # via datasets
 numpy==1.26.3
-    # via
-    #   altair
-    #   contourpy
-    #   datasets
-    #   fasttext
-    #   gradio
-    #   matplotlib
-    #   pandas
-    #   pyarrow
-orjson==3.9.12
-    # via gradio
+    # via fasttext
 packaging==23.2
-    # via
-    #   altair
-    #   datasets
-    #   gradio
-    #   gradio-client
-    #   huggingface-hub
-    #   matplotlib
-pandas==2.2.0
-    # via
-    #   altair
-    #   datasets
-    #   gradio
-pillow==10.2.0
-    # via
-    #   gradio
-    #   matplotlib
-pyarrow==15.0.0
-    # via datasets
+    # via huggingface-hub
 pybind11==2.11.1
     # via fasttext
 pydantic==2.5.3
-    # via
-    #   fastapi
-    #   gradio
+    # via fastapi
 pydantic-core==2.14.6
     # via pydantic
-pydub==0.25.1
-    # via gradio
 pygments==2.17.2
     # via rich
-pyparsing==3.1.1
-    # via matplotlib
-python-dateutil==2.8.2
-    # via
-    #   matplotlib
-    #   pandas
 python-dotenv==1.0.1
-    # via -r requirements.in
-python-multipart==0.0.6
-    # via gradio
-pytz==2023.3.post1
-    # via pandas
+    # via
+    #   -r requirements.in
+    #   uvicorn
 pyyaml==6.0.1
     # via
-    #   datasets
-    #   gradio
     #   huggingface-hub
-referencing==0.32.1
-    # via
-    #   jsonschema
-    #   jsonschema-specifications
+    #   uvicorn
 requests==2.31.0
-    # via
-    #   datasets
-    #   fsspec
-    #   huggingface-hub
+    # via huggingface-hub
 rich==13.7.0
-    # via
-    #   -r requirements.in
-    #   typer
-rpds-py==0.17.1
-    # via
-    #   jsonschema
-    #   referencing
-ruff==0.1.14
-    # via gradio
-semantic-version==2.10.0
-    # via gradio
-shellingham==1.5.4
-    # via typer
-six==1.16.0
-    # via python-dateutil
+    # via -r requirements.in
 sniffio==1.3.0
     # via
     #   anyio
     #   httpx
 starlette==0.35.1
     # via fastapi
-tomlkit==0.12.0
-    # via gradio
 toolz==0.12.0
-    # via
-    #   -r requirements.in
-    #   altair
+    # via -r requirements.in
 tqdm==4.66.1
-    # via
-    #   datasets
-    #   huggingface-hub
-typer[all]==0.9.0
-    # via
-    #   gradio
-    #   typer
+    # via huggingface-hub
 typing-extensions==4.9.0
     # via
     #   fastapi
-    #   gradio
-    #   gradio-client
     #   huggingface-hub
     #   pydantic
     #   pydantic-core
-    #   typer
-tzdata==2023.4
-    # via pandas
 urllib3==2.1.0
     # via requests
-uvicorn==0.27.0
-    # via gradio
+uvicorn[standard]==0.27.0
+    # via -r requirements.in
+uvloop==0.19.0
+    # via uvicorn
+watchfiles==0.21.0
+    # via uvicorn
 websockets==11.0.3
-    # via gradio-client
-xxhash==3.4.1
-    # via datasets
-yarl==1.9.4
-    # via aiohttp
+    # via uvicorn

 # The following packages are considered to be unsafe in a requirements file:
 # setuptools