|
import json |
|
import random |
|
import string |
|
import time |
|
from typing import Any |
|
import requests |
|
from flask import Flask, request |
|
from flask_cors import CORS |
|
from transformers import AutoTokenizer |
|
from g4f import ChatCompletion |
|
|
|
# Flask application with CORS enabled so browser-based clients on other
# origins can call the OpenAI-compatible endpoints below.
app = Flask(__name__)

CORS(app)
|
|
|
|
|
@app.route("/chat/completions", methods=["POST"])
def chat_completions():
    """OpenAI-compatible ``/chat/completions`` endpoint backed by g4f.

    Reads ``model``, ``stream``, and ``messages`` from the JSON request
    body and returns either a single ``chat.completion`` object or a
    server-sent-events stream of ``chat.completion.chunk`` objects,
    mirroring the OpenAI API response shape.
    """
    # Parse the request body once instead of re-parsing it for every field.
    body = request.get_json()
    model = body.get("model", "gpt-3.5-turbo")
    stream = body.get("stream", False)
    messages = body.get("messages")

    response = ChatCompletion.create(model=model, stream=stream, messages=messages)

    # Random 28-char id mimicking OpenAI's "chatcmpl-..." identifiers.
    completion_id = "".join(random.choices(string.ascii_letters + string.digits, k=28))
    completion_timestamp = int(time.time())

    if not stream:
        return {
            "id": f"chatcmpl-{completion_id}",
            "object": "chat.completion",
            "created": completion_timestamp,
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": response,
                    },
                    "finish_reason": "stop",
                }
            ],
            # Token counts are not available from g4f; keep OpenAI's keys
            # so clients that read them see explicit nulls.
            "usage": {
                "prompt_tokens": None,
                "completion_tokens": None,
                "total_tokens": None,
            },
        }

    def streaming():
        """Yield OpenAI-style SSE data lines, then a final stop chunk."""
        for chunk in response:
            completion_data = {
                "id": f"chatcmpl-{completion_id}",
                "object": "chat.completion.chunk",
                "created": completion_timestamp,
                "model": model,
                "choices": [
                    {
                        "index": 0,
                        "delta": {
                            "content": chunk,
                        },
                        "finish_reason": None,
                    }
                ],
            }

            content = json.dumps(completion_data, separators=(",", ":"))
            yield f"data: {content}\n\n"
            # Artificial pacing between chunks, kept from the original flow.
            time.sleep(0.1)

        # Terminal chunk: empty delta with finish_reason "stop", as the
        # OpenAI streaming protocol requires.
        end_completion_data: dict[str, Any] = {
            "id": f"chatcmpl-{completion_id}",
            "object": "chat.completion.chunk",
            "created": completion_timestamp,
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "delta": {},
                    "finish_reason": "stop",
                }
            ],
        }
        content = json.dumps(end_completion_data, separators=(",", ":"))
        yield f"data: {content}\n\n"

    return app.response_class(streaming(), mimetype="text/event-stream")
|
|
|
|
|
|
|
def get_embedding(input_text, token, max_token_length=500):
    """Embed *input_text* via the Hugging Face inference API.

    Long inputs are tokenized and split into chunks of at most
    ``max_token_length`` tokens; each chunk is embedded separately and the
    chunk vectors are averaged element-wise into a single embedding.

    Args:
        input_text: Text to embed.
        token: Hugging Face API bearer token.
        max_token_length: Maximum tokens per chunk (default 500, matching
            the original hard-coded limit).

    Returns:
        A list of floats — the element-wise mean of the chunk embeddings.

    Raises:
        requests.HTTPError: If the inference API returns an error status.
    """
    embedding_model = "sentence-transformers/all-mpnet-base-v2"

    tokenizer = AutoTokenizer.from_pretrained(embedding_model)

    tokens = tokenizer.tokenize(input_text)
    token_chunks = [tokens[i:i + max_token_length] for i in range(0, len(tokens), max_token_length)]

    # Request setup is loop-invariant — build it once, not per chunk.
    api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{embedding_model}"
    headers = {"Authorization": f"Bearer {token}"}

    embeddings = []
    for chunk in token_chunks:
        chunk_text = tokenizer.convert_tokens_to_string(chunk).replace("\n", " ")

        response = requests.post(
            api_url,
            headers=headers,
            json={"inputs": chunk_text, "options": {"wait_for_model": True}},
            timeout=60,  # avoid hanging forever on a stuck inference request
        )
        # Fail loudly instead of silently averaging an error payload into
        # the embedding.
        response.raise_for_status()

        embeddings.append(response.json())

    # Element-wise mean across chunk vectors. Assumes each response is a
    # flat list of floats (as the feature-extraction pipeline returns for a
    # single input) — the same assumption the original zip-based averaging
    # made.
    num_embeddings = len(embeddings)
    return [sum(values) / num_embeddings for values in zip(*embeddings)]
|
|
|
|
|
@app.route("/embeddings", methods=["POST"])
def embeddings():
    """OpenAI-compatible ``/embeddings`` endpoint.

    Joins the ``input`` list from the JSON body into one string, forwards
    the caller's bearer token to the Hugging Face inference API via
    :func:`get_embedding`, and returns the vector in OpenAI's response
    shape (advertised under the ``text-embedding-ada-002`` model name).
    """
    input_text_list = request.get_json().get("input")
    input_text = ' '.join(map(str, input_text_list))

    # A missing Authorization header must not crash with AttributeError;
    # default to "" and strip only a leading "Bearer " prefix.
    auth_header = request.headers.get('Authorization', '')
    token = auth_header.removeprefix("Bearer ")

    embedding = get_embedding(input_text, token)
    return {
        "data": [
            {
                "embedding": embedding,
                "index": 0,
                "object": "embedding"
            }
        ],
        "model": "text-embedding-ada-002",
        "object": "list",
        # Token usage is not tracked for this proxy; keep OpenAI's keys.
        "usage": {
            "prompt_tokens": None,
            "total_tokens": None
        }
    }
|
|
|
def main() -> None:
    """Run the development server on all interfaces, port 1337."""
    # NOTE(review): debug=True enables the interactive Werkzeug debugger;
    # combined with host="0.0.0.0" this is unsafe outside local
    # development — confirm this is never deployed as-is.
    app.run(host="0.0.0.0", port=1337, debug=True)
|
|
|
|
|
# Allow running this module directly as a standalone API server.
if __name__ == "__main__":
    main()