# external imports
from transformers import pipeline
from huggingface_hub import InferenceClient
from llama_cpp import Llama
import torch

# local imports
import config

class Phi3_Mini_4k_Instruct:
    """Phi-3-mini-4k-instruct wrapper with a local (llama.cpp) and a remote (HF Inference API) backend."""

    def __init__(self):
        pass

    def generate_text(self, messages, use_local_llm):
        # Dispatch to the local llama.cpp backend or the Hugging Face Inference API.
        if use_local_llm:
            return self.generate_text_llama_cpp(messages)
        else:
            return self.generate_text_api(messages)
    def generate_text_llama_cpp(self, messages):
        # Load the quantised GGUF model from the Hub (cached after first download)
        # and run the chat completion locally.
        model = Llama.from_pretrained(
            repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
            filename="Phi-3-mini-4k-instruct-q4.gguf",
        )
        response = model.create_chat_completion(messages)
        generated_message = response['choices'][0]['message']['content']
        return generated_message
    def generate_text_local_pipeline(self, messages):
        # Alternative local backend using a transformers text-generation pipeline
        # (not wired into generate_text above).
        self.local_pipeline = pipeline(
            "text-generation",
            model=config.LLM_MODEL,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        self.local_pipeline.model.config.max_length = config.LLM_MAX_LENGTH
        self.local_pipeline.model.config.max_new_tokens = config.LLM_MAX_NEW_TOKENS
        self.local_pipeline.model.config.temperature = config.LLM_TEMPERATURE
        self.local_pipeline.model.config.top_p = config.LLM_TOP_P
        # The pipeline returns the full chat history; take the content of the last message.
        result = self.local_pipeline(messages)[-1]['generated_text'][-1]['content']
        return result
    def generate_text_api(self, messages):
        # Remote backend: send the chat to the Hugging Face Inference API.
        client = InferenceClient(config.LLM_MODEL, token=config.HF_API_TOKEN)
        try:
            result = client.chat_completion(
                messages,
                max_tokens=config.LLM_MAX_NEW_TOKENS,
                temperature=config.LLM_TEMPERATURE,
                top_p=config.LLM_TOP_P,
            ).choices[0].message.content
        except Exception as e:
            result = f"Error: {e}"
        return result