from fastapi import FastAPI, HTTPException
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import transformers
import torch

app = FastAPI()

# Alternative backend kept for reference: a quantized Mistral GGUF model
# served through llama-cpp-python instead of the transformers pipeline below.
"""
model_path = hf_hub_download(
    repo_id="TheBloke/Mistral-7B-v0.1-GGUF",
    filename="mistral-7b-v0.1.Q4_K_M.gguf",
)

llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
)
"""


@app.get("/")
async def generate_text():
    try:
        # llama-cpp-python calls kept for reference: a plain completion and a
        # JSON-mode chat completion against the Llama instance above.
        """
        output = llm(
            "Q: Name the planets in the solar system? A: ",
            max_tokens=32,
            stop=["Q:", "\n"],
            echo=True,
        )

        output = llm.create_chat_completion(
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that outputs in JSON.",
                },
                {"role": "user", "content": "Who won the world series in 2020"},
            ],
            response_format={"type": "json_object"},
            temperature=0.7,
        )
        """
        # Build a text-generation pipeline for Llama 3 8B. Note that this
        # reloads the model on every request; for production, construct the
        # pipeline once at module import (as the Llama example above does).
        model_id = "meta-llama/Meta-Llama-3-8B"
        pipe = transformers.pipeline(
            "text-generation",
            model=model_id,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto",
        )
        # Generate a completion for the prompt and return it as the response body.
        return pipe("Hey how are you doing today?")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
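
# Example usage (a sketch; the module name "main" is an assumption, since the
# file name is not given here):
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#   curl http://localhost:8000/
#
# The GET endpoint returns the pipeline output serialized as JSON, typically a
# list containing a single {"generated_text": ...} entry.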