LVKinyanjui committed on
Commit
1ad978f
1 Parent(s): 728c92a

Abstracted away inference implementation and successfully tested the instruct template

Dockerfile CHANGED
@@ -19,4 +19,4 @@ COPY . .
  EXPOSE 8000

  # Run the application.
- CMD streamlit run app_inference.py --server.port 7860
+ CMD streamlit run inference_main.py --server.port 7860
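Note: the image still declares EXPOSE 8000 while Streamlit is started on port 7860 (the port Hugging Face Spaces conventionally serves). If 7860 is the intended port, the EXPOSE line would presumably be updated to match:

EXPOSE 7860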
inference_main.py ADDED
@@ -0,0 +1,10 @@
+ import streamlit as st
+ from modules.inference.llama3_1_8b_instruct import infer
+
+ st.write("## Ask your Local LLM")
+ text_input = st.text_input("Query", value="Why is the sky Blue")
+ submit = st.button("Submit")
+
+ if submit:
+     response = infer(text_input)
+     response
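Note that this new entry point still imports infer from the old module path, while the same commit renames that file to modules/inference/instruct.py (see the RENAMED section below). With the rename in effect, the import would presumably need to follow it:

from modules.inference.instruct import infer  # assumed path after the rename below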
modules/inference/{llama3_1_8b_instruct.py → instruct.py} RENAMED
@@ -1,45 +1,41 @@
- import streamlit as st
-
  import transformers, torch
  import json, os

  from huggingface_hub import login

  # CONSTANTS
- MAX_NEW_TOKENS = 256
+ MAX_NEW_TOKENS = 1024
  SYSTEM_MESSAGE = "You are a hepful, knowledgeable assistant"

- # ENV VARS
- # To avert Permision error with transformer and hf models
- os.environ['SENTENCE_TRANSFORMERS_HOME'] = '.'
- token = os.getenv("HF_TOKEN_WRITE") # Must be a write token
- # STREAMLIT UI AREA
+ # # ENV VARS
+ # # To avert Permision error with transformer and hf models
+ # os.environ['SENTENCE_TRANSFORMERS_HOME'] = '.'
+ # token = os.getenv("HF_TOKEN_WRITE") # Must be a write token

- st.write("## Ask your Local LLM")
- text_input = st.text_input("Query", value="Why is the sky Blue")
- submit = st.button("Submit")
+ # # Use the token to authenticate
+ # login(token=token,
+ #       write_permission=True # Must be set to True when we pass in our own token
+ #                             # Otherwise we get Permission Denied.
+ #       )

- # MODEL AREA
- # Use the token to authenticate
- login(token=token,
-       write_permission=True # Must be set to True when we pass in our own token
-                             # Otherwise we get Permission Denied.
-       )
- model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+ model_id = "microsoft/Phi-3.5-mini-instruct"
+ # model_id = "meta-llama/Llama-3.2-1B-Instruct"

- @st.cache_resource
  def load_model():
+     print(f"Loading {model_id}")
      pipeline = transformers.pipeline(
          "text-generation",
          model=model_id,
          model_kwargs={"torch_dtype": torch.bfloat16},
          device_map="auto",
      )
+     return pipeline

  pipeline = load_model()

  message_store_path = "messages.jsonl"
- messages = [
+
+ messages: list[dict] = [
      {"role": "system", "content": SYSTEM_MESSAGE},
  ]

@@ -48,13 +44,10 @@ if os.path.exists(message_store_path):
      messages = [json.loads(line) for line in f]
      print(messages)

- @st.cache_data
- def infer(message: str, messages: list[dict]):
+ def infer(message: str):
      """
      Params:
          message: Most recent query to the llm.
-         messages: Chat history up to current point properly formatted like
-             {"role": "user", "content": "What is your name?"}
      """
      messages.append({"role": "user", "content": message})

@@ -63,14 +56,23 @@ def infer(message: str, messages: list[dict]):
          messages,
          max_new_tokens=MAX_NEW_TOKENS)

+     output_text = output[-1]['generated_text'][-1]['content']
+
      # Save the newly updated messages object
      with open(message_store_path, "w", encoding="utf-8") as f:
          for line in output:
              json.dump(line, f)
              f.write("\n")

-     return output[-1]['generated_text'][-1]['content']
+     return output_text
+
+ if __name__ == "__main__":
+     while True:
+         print("Press Ctrl + C to exit.")
+         message = input("Ask a question.")
+         print(infer(message))
+
+         print("---------------------------------------")
+         print("\n\n")

+         print(messages)
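For context on the output_text extraction in infer(): when a transformers text-generation pipeline is called with a list of chat message dicts, it applies the model's chat template, and each result's generated_text holds the full conversation with the newly generated assistant turn appended last. A minimal sketch of the shape being unpacked (the payload below is illustrative, assumed from the transformers chat-pipeline convention, not taken from this commit):

output = pipeline(messages, max_new_tokens=MAX_NEW_TOKENS)
# Assumed shape of the result:
# [{"generated_text": [
#     {"role": "system", "content": SYSTEM_MESSAGE},
#     {"role": "user", "content": "Why is the sky Blue"},
#     {"role": "assistant", "content": "..."},   # new turn, appended last
# ]}]
output_text = output[-1]["generated_text"][-1]["content"]  # assistant reply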
 
requirements.txt CHANGED
@@ -1,7 +1,10 @@
+ flash_attn==2.5.8
+ torch==2.3.1
+ accelerate==0.31.0
+ transformers==4.43.0
  chromadb==0.5.5
  pymupdf==1.24.9
  streamlit==1.38.0
- transformers==4.44.2
  langchain==0.3.0
  langchain-core==0.3.5
  langchain-text-splitters==0.3.0
@@ -10,5 +13,4 @@ langchain-community==0.3.0
  python-dotenv==1.0.1
  tiktoken==0.7.0
  huggingface-hub==0.25.1
- torch==2.4.1
  langchain-ollama==0.2.0