eswardivi committed on
Commit 9815e35
1 Parent(s): 39a93b5

Update app.py

Files changed (1)
  1. app.py +4 -16
app.py CHANGED
@@ -4,7 +4,6 @@ from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     TextIteratorStreamer,
-    BitsAndBytesConfig,
 )
 import os
 from threading import Thread
@@ -13,12 +12,9 @@ import time
 
 token = os.environ["HF_TOKEN"]
 
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
-)
 
 model = AutoModelForCausalLM.from_pretrained(
-    "microsoft/Phi-3-mini-4k-instruct", quantization_config=quantization_config, token=token,trust_remote_code=True
+    "microsoft/Phi-3-mini-4k-instruct", token=token,trust_remote_code=True
 )
 tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", token=token)
 terminators = [
@@ -32,13 +28,12 @@ else:
     device = torch.device("cpu")
     print("Using CPU")
 
-# model = model.to(device)
+model = model.to(device)
 # Dispatch Errors
 
 
 @spaces.GPU(duration=60)
 def chat(message, history, temperature,do_sample, max_tokens):
-    start_time = time.time()
     chat = []
     for item in history:
         chat.append({"role": "user", "content": item[0]})
@@ -66,19 +61,12 @@ def chat(message, history, temperature,do_sample, max_tokens):
     t.start()
 
     partial_text = ""
-    first_token_time = None
     for new_text in streamer:
-        if not first_token_time:
-            first_token_time = time.time() - start_time
         partial_text += new_text
         yield partial_text
 
-    total_time = time.time() - start_time
-    tokens = len(tok.tokenize(partial_text))
-    tokens_per_second = tokens / total_time if total_time > 0 else 0
 
-    timing_info = f"\n\nTime taken to first token: {first_token_time:.2f} seconds\nTokens per second: {tokens_per_second:.2f}"
-    yield partial_text + timing_info
+    yield partial_text
 
 
 demo = gr.ChatInterface(
@@ -104,6 +92,6 @@ demo = gr.ChatInterface(
     ],
     stop_btn="Stop Generation",
     title="Chat With LLMs",
-    description="Now Running [microsoft/Phi-3-mini-4k-instruct](https://huggingface.com/microsoft/Phi-3-mini-4k-instruct) in 4bit"
+    description="Now Running [microsoft/Phi-3-mini-4k-instruct](https://huggingface.com/microsoft/Phi-3-mini-4k-instruct)"
 )
 demo.launch()
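
For context, a minimal sketch of the loading-and-streaming pattern the updated app.py settles on: a full-precision from_pretrained call (the BitsAndBytesConfig 4-bit path is removed), an explicit model.to(device), and token streaming through TextIteratorStreamer on a background thread. Only the load, the device placement, and the accumulate-and-yield loop appear verbatim in the diff; the apply_chat_template call, the streamer arguments, and the generate kwargs below are assumptions filled in from standard transformers usage.

```python
import os
import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Full-precision load, as in the updated app.py (no BitsAndBytesConfig / 4-bit path).
token = os.environ.get("HF_TOKEN")
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct", token=token, trust_remote_code=True
)
tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", token=token)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)  # explicit device placement, re-enabled by this commit


def stream_chat(messages, temperature=0.7, do_sample=True, max_tokens=256):
    # The chat-template call and generate kwargs are assumptions; the diff only
    # shows the surrounding skeleton (thread start, streamer loop, final yield).
    input_ids = tok.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(device)
    streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=do_sample,
        temperature=temperature,
    )
    Thread(target=model.generate, kwargs=generate_kwargs).start()

    partial_text = ""
    for new_text in streamer:   # accumulate-and-yield loop from the diff
        partial_text += new_text
        yield partial_text
    yield partial_text          # final yield replaces the removed timing summary
```

Without the 4-bit quantization_config, the model loads in its default dtype, so the explicit .to(device) is what moves it onto the GPU when one is available.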