thomasgauthier committed on
Commit
7a94c8b
•
1 Parent(s): 8b05b02
__pycache__/main.cpython-310.pyc ADDED
Binary file (12 kB). View file
 
__pycache__/utils.cpython-310.pyc ADDED
Binary file (1.16 kB). View file
 
main.py CHANGED
@@ -16,15 +16,15 @@ from pprint import pprint
16
  import asyncio
17
  import importlib.util
18
  import traceback
19
-
20
  import sys
21
  import json
22
  import jsonschema
23
- # import aiosqlite
24
  from utils import extract_code
25
  import numpy as np
26
  import os
27
  import requests
 
 
28
 
29
  app = FastAPI()
30
 
@@ -32,54 +32,42 @@ client_id = os.getenv("OAUTH_CLIENT_ID")
32
  client_secret = os.getenv("OAUTH_CLIENT_SECRET")
33
  space_host = os.getenv("SPACE_HOST")
34
 
35
- # DATABASE_FILE = "samples.db"
36
-
37
-
38
  client = OpenAI(
39
- base_url="https://openrouter.ai/api/v1",
40
- api_key=os.environ.get('OPENROUTER_KEY')
41
  )
42
 
43
-
44
- # async def setup_database():
45
- # async with aiosqlite.connect(DATABASE_FILE) as db:
46
- # await db.execute("""
47
- # CREATE TABLE IF NOT EXISTS samples (
48
- # hash TEXT PRIMARY KEY,
49
- # data TEXT NOT NULL,
50
- # dataset TEXT NOT NULL
51
- # )
52
- # """)
53
- # await db.commit()
54
-
55
- # async def insert_sample(hash: str, data: str, dataset: str):
56
- # async with aiosqlite.connect(DATABASE_FILE) as db:
57
- # # Check if a record with the same hash already exists
58
- # cursor = await db.execute("SELECT COUNT(*) FROM samples WHERE hash = ?", (hash,))
59
- # count = await cursor.fetchone()
60
-
61
- # if count[0] == 0:
62
- # # Insert the new record since it doesn't exist
63
- # await db.execute("INSERT INTO samples (hash, data, dataset) VALUES (?, ?, ?)", (hash, data, dataset))
64
- # await db.commit()
65
- # else:
66
- # # A record with the same hash already exists
67
- # print("Record with the same hash already exists in the database.")
68
-
69
- # async def get_sample_by_hash(hash: str):
70
- # async with aiosqlite.connect(DATABASE_FILE) as db:
71
- # cursor = await db.execute("SELECT data, dataset FROM samples WHERE hash = ?", (hash,))
72
- # row = await cursor.fetchone()
73
- # return row
74
 
75
  def is_sharegpt(sample):
76
- schema={'$schema': 'http://json-schema.org/schema#', 'type': 'object', 'properties': {'conversations': {'type': 'array', 'items': {'type': 'object', 'properties': {'from': { 'type': 'string', 'enum': ['human', 'gpt', 'system'] }, 'value': {'type': 'string'}}, 'required': ['from', 'value']}}}, 'required': ['conversations']}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  try:
78
  jsonschema.validate(instance=sample, schema=schema)
79
  return True
80
  except jsonschema.exceptions.ValidationError as e:
81
  return False
82
 
 
83
  def sha256(string):
84
  # Create a hashlib object for SHA-256
85
  sha256_hash = hashlib.sha256()
@@ -88,6 +76,7 @@ def sha256(string):
88
 
89
  return sha256_hash.hexdigest()
90
 
 
91
  def get_adapter_name(sample):
92
  builder = SchemaBuilder()
93
  builder.add_object(sample)
@@ -95,6 +84,7 @@ def get_adapter_name(sample):
95
 
96
  return sha256(json.dumps(schema))
97
 
 
98
  def has_adapter(sample):
99
  adapter_name = get_adapter_name(sample)
100
 
@@ -106,37 +96,26 @@ def has_adapter(sample):
106
 
107
  return True
108
 
 
109
  def auto_tranform(sample):
110
  adapter_name = get_adapter_name(sample)
111
  if not has_adapter(sample):
112
  create_adapter(sample, adapter_name)
113
 
114
  module_name = f"dataset_adapters.{adapter_name}"
115
- spec = importlib.util.spec_from_file_location(module_name, f"dataset_adapters/{adapter_name}.py")
 
116
  dynamic_module = importlib.util.module_from_spec(spec)
117
  sys.modules[module_name] = dynamic_module
118
  spec.loader.exec_module(dynamic_module)
119
 
120
- # Use the function from the dynamically imported module
121
  transformed_data = dynamic_module.transform_data(sample)
122
 
123
  if isinstance(transformed_data, list):
124
- return {'conversations' : transformed_data}
125
-
126
 
127
  return transformed_data
128
 
129
-
130
-
131
-
132
- # def create_adapter(sample, adapter_name):
133
- # builder = SchemaBuilder()
134
- # builder.add_object(sample)
135
- # schema = builder.to_schema()
136
-
137
- # code_string = """def transform_data(data):
138
- # raise Exception('')"""
139
-
140
  with open(f"dataset_adapters/{adapter_name}.py", 'w') as file:
141
  file.write(code_string)
142
 
@@ -146,7 +125,7 @@ def create_adapter(sample, adapter_name):
146
  builder.add_object(sample)
147
  schema = builder.to_schema()
148
 
149
- prompt = f"""Make me minimal and efficient python code to convert data in the shape of
150
 
151
  initial data shape
152
  ==========βž‘οΈπŸ“‘πŸ“==========
@@ -177,65 +156,26 @@ For transforming the data you shall use python. Make robust and elegant python c
177
 
178
  your code will contain a function `def transform_data(data):` that does the transformation and outputs the newly shaped data. Only the data, no schema. Your code snippet will include only the function signature and body. I know how to call it. You won't need to import anything, I will take care of parsing and dumping json. You work with dicts. Remember to be careful if you iterate over the data because I want the output conversation to always start with the prompt. In other words, always process "input" before "output" and "instruction" before "output". Such heuristics are very important. If there is "instruction" and "input" and the "input" is not empty, concat the input at the end of the first message. If the data contains no "system" message, human always speaks first. If it contains a "system" message, the "system" message is first, then human, then gpt, then alternating if needed
179
 
180
- "human" ALWAYS SPEAKS BEFORE "gpt", if you suspect your code makes "gpt speak first, fix it
181
 
182
  MOST IMPORTANT IS THAT YOU look at the initial data shape (βž‘οΈπŸ“‘πŸ“) to ground your transformation into final data shape (β¬‡οΈπŸ“‘πŸ“)
183
 
184
  Your output should contain only the code for `def transform_data(data):`, signature and body. Put the code inside markdown code block"""
185
 
186
  response = client.chat.completions.create(
187
- model="openai/gpt-4-1106-preview", # Optional (user controls the default)
188
- messages=[
189
- { "role": "system", "content": """You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture.
190
  Knowledge cutoff: 2023-04
191
  Current date: 2023-11-05
192
 
193
- Image input capabilities: Enabled""" },
194
- # {"role": "user", "content": f"""Make me minimal and efficient python code to convert data in the shape of
195
-
196
- # ```jsonschema
197
- # {json.dumps(schema)}
198
- # ```
199
-
200
- # to equivalent data in the form ```
201
- # {{'$schema': 'http://json-schema.org/schema#', 'type': 'object', 'properties': {{'conversations': {{'type': 'array', 'items': {{'type': 'object', 'properties': {{'from': {{ 'type': 'string', 'enum': ['human', 'gpt', 'system'] }}, 'value': {{'type': 'string'}}}}, 'required': ['from', 'value']}}}}}}, 'required': ['conversations']}}
202
- # ```
203
-
204
- # the input is
205
- # ```
206
- # {json.dumps(sample)}
207
- # ```
208
-
209
-
210
- # `input` is usually associated with `"from" : "human"` while `output` is usually associated with `"from" : "gpt"`
211
-
212
- # don't transform, make robust and elegant python code that will do the transformation
213
-
214
-
215
- # your code will contain a function `def transform_data(data):` that does the transformation and outputs the newly shaped data. Only the data, no schema. Your code snippet will include only the function signature and body. I know how to call it. You won't need to import anything, I will take care of parsing and dumping json. You work with dicts. Remember to be careful if you iterate over the data because I want the output conversation to always start with the prompt. In other words, always process "input" before "output" and "instruction" before "output". Such heuristics are very important. If there is "instruction" and "input" and the "input" is not empty, concat the input at the end of the first message."""
216
- # }
217
- {"role": "user", "content": prompt}
218
- ]
219
- )
220
 
221
  val = response.choices[0].message.content
222
- # index = val.index('def transform_data(data)')
223
-
224
- # def get_code_start():
225
- # for i in range(index,0,-1):
226
- # if val[i:i+3] == "```":
227
- # idx = val[i:].index('\n')
228
- # return i + (idx) + 1
229
-
230
- # def get_code_end():
231
- # for i in range(index, len(val)):
232
- # if val[i:i+3] == "```":
233
- # return i-1
234
 
235
- # code_string = val[get_code_start():get_code_end()]
236
-
237
-
238
- # print("###", val)
239
  code_string = extract_code(val)
240
 
241
  if code_string is None:
@@ -251,28 +191,59 @@ async def get_sample(hash: str = Query(..., alias="hash")):
251
  if res is None:
252
  raise HTTPException(status_code=404, detail="Item not found")
253
  data, dataset = res
254
- sample= auto_tranform(json.loads(data))
255
  return {'sample': sample, 'dataset': dataset}
256
 
257
- @app.get("/random-sample-stream")
258
- async def get_random_sample(background_tasks: BackgroundTasks, dataset_name: str = Query(..., alias="dataset-name"), index: str = Query(None, alias="index")):
 
 
 
 
 
 
259
  queue = asyncio.Queue()
260
- def event_stream(queue):
261
- yield f"data: {json.dumps({'status': 'grab_sample'})}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  try:
 
 
 
 
 
 
 
263
 
 
 
 
 
264
 
265
 
 
 
 
266
 
267
- # dataset = load_dataset(dataset_name,streaming=True)
268
- # split = [key for key in dataset.keys() if "train" in key]
269
-
270
-
271
-
272
-
273
- import requests
274
  headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN')}"}
275
  API_URL = f"https://datasets-server.huggingface.co/info?dataset={dataset_name}"
 
276
  def query():
277
  response = requests.get(API_URL, headers=headers)
278
  return response.json()
@@ -283,15 +254,15 @@ async def get_random_sample(background_tasks: BackgroundTasks, dataset_name: str
283
 
284
  num_samples = split['num_examples']
285
  split_name = split['name']
286
-
287
- # dataset = load_dataset(dataset_name, split=split_name, streaming=True)
288
- idx = random.randint(0, num_samples) if index is None else int(index)
289
 
 
 
290
 
291
  API_URL = f"https://datasets-server.huggingface.co/rows?dataset={dataset_name}&config=default&split=train&offset={idx}&length=1"
292
-
293
  def query():
294
- headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN')}"}
 
295
  response = requests.get(API_URL, headers=headers)
296
 
297
  if response.status_code != 200:
@@ -301,15 +272,7 @@ async def get_random_sample(background_tasks: BackgroundTasks, dataset_name: str
301
 
302
  random_sample = data['rows'][0]['row']
303
 
304
- # pprint(random_sample)
305
-
306
-
307
- # selected = dataset.skip(idx)
308
- # random_sample = next(iter(selected))#random.choice(samples_buffer)
309
-
310
  hashed = sha256(json.dumps(random_sample))
311
- # insert_sample(hashed, json.dumps(random_sample), dataset_name)
312
- # background_tasks.add_task(insert_sample, hashed, json.dumps(random_sample), dataset_name)
313
 
314
  except Exception as e:
315
  message = ""
@@ -317,9 +280,9 @@ async def get_random_sample(background_tasks: BackgroundTasks, dataset_name: str
317
  message = e.message
318
  else:
319
  message = str(e)
320
-
321
  print("error : ", message)
322
- yield f"data: {json.dumps({'status': 'error', 'message' : message })}\n\n"
323
 
324
  transformed_data = random_sample
325
 
@@ -328,7 +291,7 @@ async def get_random_sample(background_tasks: BackgroundTasks, dataset_name: str
328
  if not is_sharegpt(random_sample):
329
  try:
330
  if not has_adapter(random_sample):
331
- yield f"data: {json.dumps({'status': 'creating_adapter'})}\n\n"
332
 
333
  transformed_data = auto_tranform(random_sample)
334
  except Exception as e:
@@ -337,27 +300,29 @@ async def get_random_sample(background_tasks: BackgroundTasks, dataset_name: str
337
  print("error : ", e.message)
338
  else:
339
  print("error : ", e)
340
- yield f"data: {json.dumps({'status': 'error'})}\n\n"
341
 
342
  if success:
343
- yield f"data: {json.dumps({'status': 'done', 'data' : transformed_data, 'index' : str(idx)})}\n\n"
344
-
345
- return StreamingResponse(event_stream(queue), media_type="text/event-stream")
346
 
 
 
 
347
 
348
 
349
  @app.get("/random-sample")
350
  async def get_random_sample(dataset_name: str = Query(..., alias="dataset-name")):
351
  try:
352
- dataset = load_dataset(dataset_name,streaming=True)
353
  split = [key for key in dataset.keys() if "train" in key]
354
  dataset = load_dataset(dataset_name, split=split[0], streaming=True)
355
 
356
  buffer_size = 100 # Define a reasonable buffer size
357
- samples_buffer = [sample for _, sample in zip(range(buffer_size), dataset)]
358
-
359
- random_sample = random.choice(samples_buffer)
360
 
 
361
 
362
  hashed = sha256(json.dumps(random_sample))
363
 
@@ -368,12 +333,12 @@ async def get_random_sample(dataset_name: str = Query(..., alias="dataset-name")
368
  if module_spec is None:
369
  create_adapter(random_sample, sanitized)
370
 
371
- spec = importlib.util.spec_from_file_location(module_name, f"dataset_adapters/{sanitized}.py")
 
372
  dynamic_module = importlib.util.module_from_spec(spec)
373
  sys.modules[module_name] = dynamic_module
374
  spec.loader.exec_module(dynamic_module)
375
 
376
- # Use the function from the dynamically imported module
377
  transformed_data = dynamic_module.transform_data(random_sample)
378
 
379
  return transformed_data
@@ -384,16 +349,14 @@ async def get_random_sample(dataset_name: str = Query(..., alias="dataset-name")
384
  raise HTTPException(status_code=500, detail=str(e))
385
 
386
 
387
-
388
  @app.get("/login/callback")
389
  async def oauth_callback(code: str, state: str):
390
- # Prepare the authorization header
391
  credentials = f"{client_id}:{client_secret}"
392
  credentials_bytes = credentials.encode("ascii")
393
  base64_credentials = base64.b64encode(credentials_bytes)
394
  auth_header = f"Basic {base64_credentials.decode('ascii')}"
395
  username = ""
396
-
397
  try:
398
  token_response = requests.post(
399
  'https://huggingface.co/oauth/token',
@@ -411,31 +374,26 @@ async def oauth_callback(code: str, state: str):
411
  tokens = token_response.json()
412
  access_token = tokens.get('access_token')
413
 
414
-
415
  if access_token:
416
- try:
417
- user_response = requests.get(
418
- 'https://huggingface.co/api/user',
419
- headers={'Authorization': f'Bearer {access_token}'}
420
- )
421
-
422
- if user_response.status_code == 200:
423
- user_data = user_response.json()
424
- username = user_data['username']
425
- # Now you have the username, you can handle it as needed
426
- else:
427
- username = ""
428
- print(f"Error getting user data: {user_response.status_code}, {user_response.text}")
429
-
430
- except Exception:
431
- traceback.print_exc()
432
- username = ""
433
- else:
434
- username = ""
435
 
 
436
 
437
- # ID Token can be extracted here if needed
438
- # id_token = tokens.get('id_token')
 
 
 
 
439
  else:
440
  access_token = ""
441
 
@@ -443,23 +401,20 @@ async def oauth_callback(code: str, state: str):
443
  traceback.print_exc()
444
  access_token = ""
445
 
446
- return {"access_token": access_token, "username" : username}
 
447
 
448
  @app.get("/oauth-config")
449
  async def get_oauth_config(request: Request):
450
- # client_host = "https://huggingface.co/spaces/thomasgauthier/ChatExplorer#request.client.host
451
  return {
452
  "client_id": client_id,
453
  "redirect_uri": f'https://{space_host}/login/callback'
454
  }
455
 
456
 
457
- # # @app.on_event("startup")
458
- # # async def startup_event():
459
- # # await setup_database()
460
  @app.get("/")
461
  def index() -> FileResponse:
462
  return FileResponse(path="static/index.html", media_type="text/html")
463
 
464
 
465
- app.mount("/", StaticFiles(directory="static"), name="static")
 
16
  import asyncio
17
  import importlib.util
18
  import traceback
 
19
  import sys
20
  import json
21
  import jsonschema
 
22
  from utils import extract_code
23
  import numpy as np
24
  import os
25
  import requests
26
+ import secrets
27
+ import urllib.parse
28
 
29
  app = FastAPI()
30
 
 
32
  client_secret = os.getenv("OAUTH_CLIENT_SECRET")
33
  space_host = os.getenv("SPACE_HOST")
34
 
 
 
 
35
  client = OpenAI(
36
+ base_url="https://openrouter.ai/api/v1",
37
+ api_key=os.environ.get('OPENROUTER_KEY')
38
  )
39
 
40
+ state_queue_map = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  def is_sharegpt(sample):
43
+ schema = {
44
+ '$schema': 'http://json-schema.org/schema#',
45
+ 'type': 'object',
46
+ 'properties': {
47
+ 'conversations': {
48
+ 'type': 'array',
49
+ 'items': {
50
+ 'type': 'object',
51
+ 'properties': {
52
+ 'from': {
53
+ 'type': 'string',
54
+ 'enum': [
55
+ 'human',
56
+ 'gpt',
57
+ 'system']},
58
+ 'value': {
59
+ 'type': 'string'}},
60
+ 'required': [
61
+ 'from',
62
+ 'value']}}},
63
+ 'required': ['conversations']}
64
  try:
65
  jsonschema.validate(instance=sample, schema=schema)
66
  return True
67
  except jsonschema.exceptions.ValidationError as e:
68
  return False
69
 
70
+
71
  def sha256(string):
72
  # Create a hashlib object for SHA-256
73
  sha256_hash = hashlib.sha256()
 
76
 
77
  return sha256_hash.hexdigest()
78
 
79
+
80
  def get_adapter_name(sample):
81
  builder = SchemaBuilder()
82
  builder.add_object(sample)
 
84
 
85
  return sha256(json.dumps(schema))
86
 
87
+
88
  def has_adapter(sample):
89
  adapter_name = get_adapter_name(sample)
90
 
 
96
 
97
  return True
98
 
99
+
100
  def auto_tranform(sample):
101
  adapter_name = get_adapter_name(sample)
102
  if not has_adapter(sample):
103
  create_adapter(sample, adapter_name)
104
 
105
  module_name = f"dataset_adapters.{adapter_name}"
106
+ spec = importlib.util.spec_from_file_location(
107
+ module_name, f"dataset_adapters/{adapter_name}.py")
108
  dynamic_module = importlib.util.module_from_spec(spec)
109
  sys.modules[module_name] = dynamic_module
110
  spec.loader.exec_module(dynamic_module)
111
 
 
112
  transformed_data = dynamic_module.transform_data(sample)
113
 
114
  if isinstance(transformed_data, list):
115
+ return {'conversations': transformed_data}
 
116
 
117
  return transformed_data
118
 
 
 
 
 
 
 
 
 
 
 
 
119
  with open(f"dataset_adapters/{adapter_name}.py", 'w') as file:
120
  file.write(code_string)
121
 
 
125
  builder.add_object(sample)
126
  schema = builder.to_schema()
127
 
128
+ prompt = f"""Make me minimal and efficient python code to convert data in the shape of
129
 
130
  initial data shape
131
  ==========βž‘οΈπŸ“‘πŸ“==========
 
156
 
157
  your code will contain a function `def transform_data(data):` that does the transformation and outputs the newly shaped data. Only the data, no schema. Your code snippet will include only the function signature and body. I know how to call it. You won't need to import anything, I will take care of parsing and dumping json. You work with dicts. Remember to be careful if you iterate over the data because I want the output conversation to always start with the prompt. In other words, always process "input" before "output" and "instruction" before "output". Such heuristics are very important. If there is "instruction" and "input" and the "input" is not empty, concat the input at the end of the first message. If the data contains no "system" message, human always speaks first. If it contains a "system" message, the "system" message is first, then human, then gpt, then alternating if needed
158
 
159
+ "human" ALWAYS SPEAKS BEFORE "gpt", if you suspect your code makes "gpt speak first, fix it
160
 
161
  MOST IMPORTANT IS THAT YOU look at the initial data shape (βž‘οΈπŸ“‘πŸ“) to ground your transformation into final data shape (β¬‡οΈπŸ“‘πŸ“)
162
 
163
  Your output should contain only the code for `def transform_data(data):`, signature and body. Put the code inside markdown code block"""
164
 
165
  response = client.chat.completions.create(
166
+ model="openai/gpt-4-1106-preview",
167
+ messages=[
168
+ {"role": "system", "content": """You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture.
169
  Knowledge cutoff: 2023-04
170
  Current date: 2023-11-05
171
 
172
+ Image input capabilities: Enabled"""},
173
+ {"role": "user", "content": prompt}
174
+ ]
175
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
  val = response.choices[0].message.content
 
 
 
 
 
 
 
 
 
 
 
 
178
 
 
 
 
 
179
  code_string = extract_code(val)
180
 
181
  if code_string is None:
 
191
  if res is None:
192
  raise HTTPException(status_code=404, detail="Item not found")
193
  data, dataset = res
194
+ sample = auto_tranform(json.loads(data))
195
  return {'sample': sample, 'dataset': dataset}
196
 
197
+
198
+ def generate_random_string(length=16):
199
+ return secrets.token_hex(length)
200
+
201
+
202
+
203
+ @app.get("/oauth_token")
204
+ async def get_oauth_token():
205
  queue = asyncio.Queue()
206
+
207
+ async def event_stream(queue, state):
208
+ state_queue_map[state] = queue
209
+
210
+ redirect_uri = f'https://{space_host}/login/callback'
211
+
212
+ auth_url = (
213
+ f"https://huggingface.co/oauth/authorize?"
214
+ f"redirect_uri={urllib.parse.quote(redirect_uri)}&"
215
+ f"client_id={client_id}&"
216
+ f"scope=openid%20profile&"
217
+ f"response_type=code&"
218
+ f"state={state}"
219
+ )
220
+ yield f"data: {json.dumps({ 'url' : auth_url })}\n\n"
221
+
222
  try:
223
+ while True:
224
+ message = await queue.get()
225
+ if 'end_stream' in message and message['end_stream']:
226
+ break
227
+ yield f"data: {json.dumps(message)}\n\n"
228
+ finally:
229
+ del state_queue_map[state]
230
 
231
+ state = generate_random_string()
232
+ return StreamingResponse(
233
+ event_stream(queue, state),
234
+ media_type="text/event-stream")
235
 
236
 
237
+ @app.get("/random-sample-stream")
238
+ async def get_random_sample(background_tasks: BackgroundTasks, dataset_name: str = Query(..., alias="dataset-name"), index: str = Query(None, alias="index")):
239
+ queue = asyncio.Queue()
240
 
241
+ def event_stream(queue):
242
+ yield f"data: {json.dumps({'status': 'grab_sample'})}\n\n"
243
+ try:
 
 
 
 
244
  headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN')}"}
245
  API_URL = f"https://datasets-server.huggingface.co/info?dataset={dataset_name}"
246
+
247
  def query():
248
  response = requests.get(API_URL, headers=headers)
249
  return response.json()
 
254
 
255
  num_samples = split['num_examples']
256
  split_name = split['name']
 
 
 
257
 
258
+ idx = random.randint(
259
+ 0, num_samples) if index is None else int(index)
260
 
261
  API_URL = f"https://datasets-server.huggingface.co/rows?dataset={dataset_name}&config=default&split=train&offset={idx}&length=1"
262
+
263
  def query():
264
+ headers = {
265
+ "Authorization": f"Bearer {os.environ.get('HF_TOKEN')}"}
266
  response = requests.get(API_URL, headers=headers)
267
 
268
  if response.status_code != 200:
 
272
 
273
  random_sample = data['rows'][0]['row']
274
 
 
 
 
 
 
 
275
  hashed = sha256(json.dumps(random_sample))
 
 
276
 
277
  except Exception as e:
278
  message = ""
 
280
  message = e.message
281
  else:
282
  message = str(e)
283
+
284
  print("error : ", message)
285
+ yield f"data: {json.dumps({'status': 'error', 'message' : message })}\n\n"
286
 
287
  transformed_data = random_sample
288
 
 
291
  if not is_sharegpt(random_sample):
292
  try:
293
  if not has_adapter(random_sample):
294
+ yield f"data: {json.dumps({'status': 'creating_adapter'})}\n\n"
295
 
296
  transformed_data = auto_tranform(random_sample)
297
  except Exception as e:
 
300
  print("error : ", e.message)
301
  else:
302
  print("error : ", e)
303
+ yield f"data: {json.dumps({'status': 'error'})}\n\n"
304
 
305
  if success:
306
+ yield f"data: {json.dumps({'status': 'done', 'data' : transformed_data, 'index' : str(idx)})}\n\n"
 
 
307
 
308
+ return StreamingResponse(
309
+ event_stream(queue),
310
+ media_type="text/event-stream")
311
 
312
 
313
  @app.get("/random-sample")
314
  async def get_random_sample(dataset_name: str = Query(..., alias="dataset-name")):
315
  try:
316
+ dataset = load_dataset(dataset_name, streaming=True)
317
  split = [key for key in dataset.keys() if "train" in key]
318
  dataset = load_dataset(dataset_name, split=split[0], streaming=True)
319
 
320
  buffer_size = 100 # Define a reasonable buffer size
321
+ samples_buffer = [
322
+ sample for _, sample in zip(
323
+ range(buffer_size), dataset)]
324
 
325
+ random_sample = random.choice(samples_buffer)
326
 
327
  hashed = sha256(json.dumps(random_sample))
328
 
 
333
  if module_spec is None:
334
  create_adapter(random_sample, sanitized)
335
 
336
+ spec = importlib.util.spec_from_file_location(
337
+ module_name, f"dataset_adapters/{sanitized}.py")
338
  dynamic_module = importlib.util.module_from_spec(spec)
339
  sys.modules[module_name] = dynamic_module
340
  spec.loader.exec_module(dynamic_module)
341
 
 
342
  transformed_data = dynamic_module.transform_data(random_sample)
343
 
344
  return transformed_data
 
349
  raise HTTPException(status_code=500, detail=str(e))
350
 
351
 
 
352
  @app.get("/login/callback")
353
  async def oauth_callback(code: str, state: str):
 
354
  credentials = f"{client_id}:{client_secret}"
355
  credentials_bytes = credentials.encode("ascii")
356
  base64_credentials = base64.b64encode(credentials_bytes)
357
  auth_header = f"Basic {base64_credentials.decode('ascii')}"
358
  username = ""
359
+
360
  try:
361
  token_response = requests.post(
362
  'https://huggingface.co/oauth/token',
 
374
  tokens = token_response.json()
375
  access_token = tokens.get('access_token')
376
 
 
377
  if access_token:
378
+ url = "https://huggingface.co/oauth/userinfo"
379
+
380
+ payload = ""
381
+ headers = {
382
+ "Content-Type": "application/json",
383
+ "Authorization": f"Bearer {access_token}"
384
+ }
385
+
386
+ response = requests.request(
387
+ "GET", url, data=payload, headers=headers)
 
 
 
 
 
 
 
 
 
388
 
389
+ print(response.text)
390
 
391
+ if state in state_queue_map:
392
+ queue = state_queue_map[state]
393
+ await queue.put({"access_token": access_token, "username": username})
394
+ await queue.put({"end_stream": True})
395
+ else:
396
+ username = ""
397
  else:
398
  access_token = ""
399
 
 
401
  traceback.print_exc()
402
  access_token = ""
403
 
404
+ return {"access_token": access_token, "username": username}
405
+
406
 
407
  @app.get("/oauth-config")
408
  async def get_oauth_config(request: Request):
 
409
  return {
410
  "client_id": client_id,
411
  "redirect_uri": f'https://{space_host}/login/callback'
412
  }
413
 
414
 
 
 
 
415
  @app.get("/")
416
  def index() -> FileResponse:
417
  return FileResponse(path="static/index.html", media_type="text/html")
418
 
419
 
420
+ app.mount("/", StaticFiles(directory="static"), name="static")
static/assets/{index-66194b32.js → index-7974ca0c.js} RENAMED
The diff for this file is too large to render. See raw diff
 
static/index.html CHANGED
@@ -5,7 +5,7 @@
5
  <link rel="icon" type="image/svg+xml" href="/vite.svg" />
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
  <title>Vite + Preact</title>
8
- <script type="module" crossorigin src="/assets/index-66194b32.js"></script>
9
  <link rel="stylesheet" href="/assets/index-abe6d7fb.css">
10
  </head>
11
  <body ondrop="event.preventDefault()" >
 
5
  <link rel="icon" type="image/svg+xml" href="/vite.svg" />
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
  <title>Vite + Preact</title>
8
+ <script type="module" crossorigin src="/assets/index-7974ca0c.js"></script>
9
  <link rel="stylesheet" href="/assets/index-abe6d7fb.css">
10
  </head>
11
  <body ondrop="event.preventDefault()" >