Spaces:

GIZ
/

audit_assistant

Running on T4

ppsingh committed on Oct 22, 2024

Commit

06bf5a5

verified ·

1 Parent(s): 5620e1f

Create reader.py

Files changed (1) hide show

auditqa/reader.py ADDED Viewed

+from huggingface_hub import InferenceClient
+from auditqa.process_chunks import getconfig
+from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+from langchain_community.llms import HuggingFaceEndpoint
+from langchain_community.chat_models.huggingface import ChatHuggingFace
+import os
+from dotenv import load_dotenv
+# Load variables from a local .env file (python-dotenv) before the
+# os.environ lookups below are evaluated.
+load_dotenv()
+# Settings object for the model parameters file; it is queried below with
+# .get('reader', <KEY>), so it is presumably ConfigParser-like (confirm in
+# auditqa.process_chunks.getconfig).
+model_config = getconfig("model_params.cfg")
+# Both secrets are read with os.environ[...], so importing this module raises
+# KeyError when either variable is not set.
+# NVIDIA_SERVER is passed as the api_key in nvidia_client().
+NVIDIA_SERVER = os.environ["NVIDIA_SERVERLESS"]
+# HF_token is passed as huggingfacehub_api_token in dedicated_endpoint().
+HF_token = os.environ["LLAMA_3_1"]
+def nvidia_client():
+    client = InferenceClient(
+    base_url=model_config.get('reader','NVIDIA_ENDPOINT'),
+    api_key=NVIDIA_SERVER)
+    return client
+def dedicated_endpoint():
+     # Set up the streaming callback handler
+    callback = StreamingStdOutCallbackHandler()
+    # Initialize the HuggingFaceEndpoint with streaming enabled
+    llm_qa = HuggingFaceEndpoint(
+        endpoint_url=model_config.get('reader', 'DEDICATED_ENDPOINT'),
+        max_new_tokens=int(model_config.get('reader','MAX_TOKENS')),
+        repetition_penalty=1.03,
+        timeout=70,
+        huggingfacehub_api_token=HF_token,
+        streaming=True, # Enable streaming for real-time token generation
+        callbacks=[callback] # Add the streaming callback handler
+    )
+    # Create a ChatHuggingFace instance with the streaming-enabled endpoint
+    chat_model = ChatHuggingFace(llm=llm_qa)
+    return chat_model