tastypear committed
Commit 8053a65
1 Parent: f3d45c6
Files changed (4):
  1. Dockerfile +11 -0
  2. README.md +11 -11
  3. main.py +76 -0
  4. requirements.txt +3 -0
Dockerfile ADDED
@@ -0,0 +1,11 @@
+ FROM python:3.9
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ COPY . .
+
+ CMD ["gunicorn", "-b", "0.0.0.0:7860", "main:app"]
README.md CHANGED
@@ -1,11 +1,11 @@
- ---
- title: Sia Chat Adapter
- emoji: 🐢
- colorFrom: green
- colorTo: yellow
- sdk: docker
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: Huggingface Chat API Adapter
+ emoji: 💻
+ colorFrom: yellow
+ colorTo: indigo
+ sdk: docker
+ pinned: false
+ license: apache-2.0
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
main.py ADDED
@@ -0,0 +1,76 @@
+ import random
+ import requests
+ from flask import Flask, request, Response, stream_with_context, render_template_string
+
+ app = Flask(__name__)
+
+ @app.route('/', methods=['GET'])
+ def index():
+     template = '''
+     <html>
+     <head>
+         <title>Huggingface Chat API Adapter</title>
+     </head>
+     <body>
+         <h1>Huggingface Chat API Adapter</h1>
+
+         [Introduction]<br>
+         When using Huggingface's Serverless Inference API for a conversation, by default only 100 new tokens are generated and a cache is used.<br>
+         This API changes these two defaults; all other parameters are consistent with the official API.<br>
+         <br>
+         [How to use]<br>
+         1. <a target="_blank" href="https://huggingface.co/settings/tokens/new">Create a token</a> with the "Make calls to the serverless Inference API" permission as an API key.<br>
+         2. Set the Base URL of your OpenAI-compatible client to "https://tastypear-sia-chat-adapter.hf.space/api".<br>
+         3. Use the full name of the model (e.g. mistralai/Mistral-Nemo-Instruct-2407).<br>
+         <br>
+         [Supported models]<br>
+         Most of the available models can be found <a target="_blank" href="https://huggingface.co/models?inference=warm&other=text-generation-inference">HERE</a>.<br>
+         Some "cold" models may also be supported (e.g. meta-llama/Meta-Llama-3.1-405B-Instruct); please test them yourself.<br>
+         Some models require a token created by a PRO user.<br>
+         <br>
+         [Avoid reaching the call limit]<br>
+         If you have multiple tokens, you can join them with semicolons and the API will use a random one (e.g. "hf_aaaa;hf_bbbb;hf_...").<br>
+     </body>
+     </html>
+     '''
+     return render_template_string(template)
+
+ @app.route('/api/v1/chat/completions', methods=['POST'])
+ def proxy():
+     headers = dict(request.headers)
+     headers.pop('Host', None)
+     headers.pop('Content-Length', None)
+     keys = request.headers['Authorization'].split(' ')[1].split(';')
+     headers['Authorization'] = f'Bearer {random.choice(keys)}'
+     headers['X-Use-Cache'] = 'false'
+
+     json_data = request.get_json()
+     model = json_data['model']
+     chat_api = f"https://api-inference.huggingface.co/models/{model}/v1/chat/completions"
+
+     # Try to use the largest ctx: probe with an impossible max_tokens and parse the limits from the error
+     if 'max_tokens' not in json_data:
+         json_data['max_tokens'] = 2**32-1
+         json_data['json_mode'] = True
+         info = requests.post(chat_api, json=json_data, headers=headers, stream=False).text
+         max_ctx = int(info.split("<= ")[1].split(".")[0])
+         inputs = int(info.split("Given: ")[1].split("`")[0])
+         json_data['json_mode'] = False
+         json_data['max_tokens'] = max_ctx - inputs - 1
+
+     if 'seed' not in json_data:
+         json_data['seed'] = random.randint(1, 2**32)
+
+     def generate():
+         with requests.post(chat_api, json=json_data, headers=headers, stream=True) as resp:
+             for chunk in resp.iter_content(chunk_size=1024):
+                 if chunk:
+                     yield chunk
+
+     return Response(stream_with_context(generate()), content_type='text/event-stream')
+
+ #import gevent.pywsgi
+ #from gevent import monkey;monkey.patch_all()
+ if __name__ == "__main__":
+     app.run(debug=True)
+     # gevent.pywsgi.WSGIServer((args.host, args.port), app).serve_forever()
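
When the client omits max_tokens, proxy() probes the backend once with an impossibly large value: the Inference API rejects the request with a validation error whose text reports both the model's context limit ("... must be <= N.") and the prompt's token count ("Given: M"), and the split() calls parse those two numbers to set max_tokens to everything that remains. Note this depends on the exact wording of the error, so an upstream format change would break the probe.

Below is a minimal client-side sketch of calling the adapter, following the instructions in the template above; the Space URL and model name come from the document, and "hf_xxxx" is a placeholder token:

    import requests

    # Call the adapter like any OpenAI-compatible chat endpoint.
    # "hf_xxxx" is a placeholder; "hf_aaaa;hf_bbbb" would rotate several keys.
    resp = requests.post(
        "https://tastypear-sia-chat-adapter.hf.space/api/v1/chat/completions",
        headers={"Authorization": "Bearer hf_xxxx"},
        json={
            "model": "mistralai/Mistral-Nemo-Instruct-2407",
            "messages": [{"role": "user", "content": "Hello!"}],
            "stream": True,
        },
        stream=True,
    )
    for line in resp.iter_lines():
        if line:
            print(line.decode())  # raw SSE lines: data: {...}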
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ requests
+ flask
+ gunicorn
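
The dependencies are left unpinned, so each build pulls the latest releases; if reproducible builds matter, pinned versions could be used instead (the version numbers below are only illustrative):

    requests==2.31.0
    flask==3.0.0
    gunicorn==21.2.0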