Spaces:

randomblock1
/

phi-2

Sleeping

App Files Community

Benjamin G committed on Dec 15, 2023

Commit

0439661

•

1 Parent(s): c4f947a

added streaming

Browse files

Files changed (2) hide show

app.py +79 -22
requirements.txt +0 -15

app.py CHANGED Viewed

@@ -3,47 +3,70 @@ from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
     TextIteratorStreamer,
-    StoppingCriteriaList,
 )
 from threading import Thread
 import gradio as gr
-if torch.cuda.is_available():
-    torch.set_default_device("cuda")
 tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(
     "microsoft/phi-2",
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
     trust_remote_code=True,
 )
-def Phi2StoppingCriteria(
-    input_ids: torch.LongTensor, score: torch.FloatTensor, **kwargs
-) -> bool:
-    stop_list = ["Exercise", "Exercises", "<|endoftext|>"]
-    stop_tokens = []
-    for stop in stop_list:
-        stop_tokens.append(
-            tokenizer(stop, add_special_tokens=False, return_tensors="pt").input_ids
-        )
-    return input_ids[-1] in stop_tokens
-stopping_criteria = StoppingCriteriaList([Phi2StoppingCriteria])
-def generate(prompt, max_new_tokens):
     inputs = tokenizer(prompt, return_tensors="pt")
     # thanks https://huggingface.co/spaces/joaogante/transformers_streaming/blob/main/app.py
-    streamer = TextIteratorStreamer(inputs)
     generation_kwargs = dict(
         inputs,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
-        do_sample=True,
-        stopping_criteria=stopping_criteria,
     )
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
@@ -61,7 +84,41 @@ demo = gr.Interface(
             label="prompt",
             value="Write a detailed analogy between mathematics and a lighthouse.",
         ),
-        gr.Number(value=100, label="max new tokens", maximum=500),
     ],
     outputs="text",
     examples=[
@@ -84,7 +141,7 @@ demo = gr.Interface(
    """\n''',
             100,
         ],
-        ["User: How does sleep affect mood?\nAI:", 125],
         ["Who was Ada Lovelace?", 100],
         ["Explain the concept of skip lists.", 125],
     ],

     AutoTokenizer,
     AutoModelForCausalLM,
     TextIteratorStreamer,
+    StoppingCriteria,
 )
 from threading import Thread
 import gradio as gr
+# has_gpu = torch.cuda.is_available()
+has_gpu = False
+device = "cuda" if has_gpu else "cpu"
+torch.set_default_device(device)
 tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(
     "microsoft/phi-2",
+    # torch_dtype=torch.float16 if has_gpu else torch.float32,
+    torch_dtype=torch.float32,
+    device_map=device,
     trust_remote_code=True,
 )
+# custom stopping criteria (avoid generating hallucinated prompts)
+# still includes these tokens in the output but stops generating after them
+class Phi2StoppingCriteria(StoppingCriteria):
+    def __init__(self):
+        stop_list = ["Exercise", "Exercises", "<|endoftext|>"]
+        tokenphrases = []
+        for token in stop_list:
+            tokenphrases.append(
+                tokenizer(token, return_tensors="pt").input_ids[0].tolist()
+            )
+        self.tokenphrases = tokenphrases
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
+    ) -> bool:
+        for tokenphrase in self.tokenphrases:
+            if tokenphrase == input_ids[0].tolist()[-len(tokenphrase):]:
+                return True
+def generate(
+    prompt,
+    max_new_tokens,
+    avoid_hallucinated_prompts,
+    sampling,
+    temperature,
+    top_k,
+    top_p,
+):
     inputs = tokenizer(prompt, return_tensors="pt")
     # thanks https://huggingface.co/spaces/joaogante/transformers_streaming/blob/main/app.py
+    streamer = TextIteratorStreamer(tokenizer)
     generation_kwargs = dict(
         inputs,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
+        do_sample=sampling,
+        stopping_criteria=[Phi2StoppingCriteria()]
+        if avoid_hallucinated_prompts
+        else None,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=top_p,
     )
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
             label="prompt",
             value="Write a detailed analogy between mathematics and a lighthouse.",
         ),
+        gr.Slider(minimum=0, maximum=500, step=1, value=100, label="max new tokens"),
+        gr.Checkbox(
+            value=True,
+            label="avoid hallucinated prompts",
+            info="stop generation after getting tokens like 'Exercise' or '<|endoftext|>, but will not remove them.",
+        ),
+        gr.Checkbox(
+            label="do sampling",
+            info="introduce randomness for non-deterministic results. required for below options",
+            value=True,
+        ),
+        gr.Slider(
+            label="temperature",
+            info="higher temperature means more randomness",
+            value=1.0,
+            minimum=0.1,
+            maximum=1.5,
+            step=0.1,
+        ),
+        gr.Slider(
+            label="top-k",
+            info="consider only the k most likely tokens",
+            value=50,
+            minimum=1,
+            maximum=100,
+            step=1,
+        ),
+        gr.Slider(
+            label="top-p",
+            info="choose from the smallest possible set of words whose cumulative probability exceeds the probability p",
+            value=1.0,
+            minimum=0.1,
+            maximum=1.0,
+            step=0.1,
+        ),
     ],
     outputs="text",
     examples=[
    """\n''',
             100,
         ],
+        ["User: How does sleep affect mood?\nAI:", 75],
         ["Who was Ada Lovelace?", 100],
         ["Explain the concept of skip lists.", 125],
     ],

requirements.txt CHANGED Viewed

@@ -1,20 +1,5 @@
-mlflow==2.6.0
-cloudpickle==2.2.1
-jsonpickle==3.0.1
-mlflow-skinny==2.6.0
-azureml-core==1.51.0.post1
-azureml-mlflow==1.51.0
-azureml-metrics[all]==0.0.32
 scikit-learn==1.2.2
-cryptography==41.0.1
-python-dateutil==2.8.2
-datasets==2.14.6
-soundfile==0.12.1
-librosa==0.10.1
 diffusers==0.21.4
-sentencepiece==0.1.99
 transformers==4.34.0
 accelerate==0.23.0
-Pillow==9.4.0
 einops
-azureml-evaluate-mlflow==0.0.32

 scikit-learn==1.2.2
 diffusers==0.21.4
 transformers==4.34.0
 accelerate==0.23.0
 einops