ranamhamoud committed
Commit 4d5d8af • 1 Parent(s): 85f58d4

Update app.py

Files changed (1)
  1. app.py +82 -56
app.py CHANGED
@@ -1,6 +1,5 @@
 import os
 import re
-import logging
 import torch
 from threading import Thread
 from typing import Iterator
@@ -12,7 +11,7 @@ from peft import PeftModel
 
 # Constants
 MAX_MAX_NEW_TOKENS = 2048
-DEFAULT_MAX_NEW_TOKENS = 930
+DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 LICENSE = """
@@ -21,92 +20,120 @@ As a derivative work of [Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-
 this demo is governed by the original [license](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/LICENSE.txt) and [acceptable use policy](https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat/blob/main/USE_POLICY.md).
 """
 
+# GPU Check and add CPU warning
 if not torch.cuda.is_available():
     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
 
-if torch.cuda.is_available():
-    modelA_id = "meta-llama/Llama-2-7b-hf"
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_use_double_quant=False,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.bfloat16
-    )
-    base_model = AutoModelForCausalLM.from_pretrained(modelA_id, device_map="auto", quantization_config=bnb_config)
-    modelA = PeftModel.from_pretrained(base_model, "ranamhamoud/storytell")
-    tokenizerA = AutoTokenizer.from_pretrained(modelA_id)
-    tokenizerA.pad_token = tokenizerA.eos_token
+if torch.cuda.is_available():
+
+    # Model and Tokenizer Configuration
+    model_id = "meta-llama/Llama-2-7b-chat-hf"
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_use_double_quant=False,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16
+    )
+    base_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=bnb_config)
+    model = PeftModel.from_pretrained(base_model, "ranamhamoud/storytell")
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    tokenizer.pad_token = tokenizer.eos_token
 
-    modelB_id = "meta-llama/Llama-2-7b-chat-hf"
-    modelB = AutoModelForCausalLM.from_pretrained(modelB_id, torch_dtype=torch.float16, device_map="auto")
-    tokenizerB = AutoTokenizer.from_pretrained(modelB_id)
-    tokenizerB.use_default_system_prompt = False
-    tokenizerB.pad_token = tokenizerB.eos_token
+# # MongoDB Connection
+# PASSWORD = os.environ.get("MONGO_PASS")
+# connect(host=f"mongodb+srv://ranamhammoud11:{PASSWORD}@stories.zf5v52a.mongodb.net/")
 
-
+# # MongoDB Document
+# class Story(Document):
+#     message = StringField()
+#     content = StringField()
+#     story_id = SequenceField(primary_key=True)
 
+# Utility function for prompts
 def make_prompt(entry):
     return f"### Human: Don't repeat the assesments, limit to 500 words {entry} ### Assistant:"
+    # f"TELL A STORY, RELATE TO COMPUTER SCIENCE, INCLUDE ASSESMENTS. MAKE IT REALISTIC AND AROUND 800 WORDS, END THE STORY WITH "THE END.": {entry}"
+
+def process_text(text):
+    # First, handle the specific case for [answer:]
+    # This replaces [answer:] with "Answer:" and keeps the content after it on the same line.
+    text = re.sub(r'\[answer:\]\s*', 'Answer: ', text)
+
+    # Now, remove all other content within brackets.
+    # This regex looks for square brackets and any content inside them, excluding those that start with "Answer: " already modified.
+    text = re.sub(r'\[.*?\](?<!Answer: )', '', text)
+
+    return text
+custom_css = """
+body, input, button, textarea, label {
+    font-family: Arial, sans-serif;
+    font-size: 24px;
+}
+.gr-chat-interface .gr-chat-message-container {
+    font-size: 14px;
+}
+.gr-button {
+    font-size: 14px;
+    padding: 12px 24px;
+}
+.gr-input {
+    font-size: 14px;
+}
+"""
 
+# Gradio Function
 @spaces.GPU
 def generate(
-    model: str,
     message: str,
     chat_history: list[tuple[str, str]],
-    max_new_tokens: int = 1024,
-    # temperature: float = 0.6,
-    # top_p: float = 0.9,
-    # top_k: int = 50,
-    # repetition_penalty: float = 1.2,
+    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
+    temperature: float = 0.6,
+    top_p: float = 0.7,
+    top_k: int = 20,
+    repetition_penalty: float = 1.0,
 ) -> Iterator[str]:
-    if chat_history is None:
-        logging.error("chat_history is None, initializing to empty list.")
-        chat_history = [] # Initialize to an empty list if None is passed
-
     conversation = []
     for user, assistant in chat_history:
         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-    conversation.append({"role": "user", "content": message})
-    if model == "A":
-        model = modelA
-        tokenizer = tokenizerA
-    else:
-        model = modelB
-        tokenizer = tokenizerB
-
+    conversation.append({"role": "user", "content": make_prompt(message)})
     enc = tokenizer(make_prompt(message), return_tensors="pt", padding=True, truncation=True)
-    input_ids = enc.input_ids.to(model.device)
-
+    input_ids = enc.input_ids.to(model.device)
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-    input_ids = input_ids.to(model.device)
-
-    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+
+    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=False)
     generate_kwargs = dict(
         {"input_ids": input_ids},
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
-        # top_p=top_p,
-        # top_k=top_k,
-        # temperature=temperature,
-        # num_beams=1,
-        # repetition_penalty=repetition_penalty,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        num_beams=1,
+        repetition_penalty=repetition_penalty,
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
 
     outputs = []
     for text in streamer:
-        outputs.append(text)
-        yield "".join(outputs)
-    logging.basicConfig(level=logging.DEBUG)
+        processed_text = process_text(text)
+        outputs.append(processed_text)
+        output = "".join(outputs)
+        yield output
+
+    # final_story = "".join(outputs)
+    # try:
+    #     saved_story = Story(message=message, content=final_story).save()
+    #     yield f"{final_story}\n\n Story saved with ID: {saved_story.story_id}"
+    # except Exception as e:
+    #     yield f"Failed to save story: {str(e)}"
 
 # Gradio Interface Setup
 chat_interface = gr.ChatInterface(
     fn=generate,
-    additional_inputs=[gr.Dropdown(["A", "B"],label="Model", info="Will add more animals later!")],
     fill_height=True,
     stop_btn=None,
     examples=[
@@ -118,13 +145,12 @@ chat_interface = gr.ChatInterface(
 )
 
 # Gradio Web Interface
-with gr.Blocks(theme='shivi/calm_seafoam',fill_height=True) as demo:
-    # gr.Markdown(DESCRIPTION)
+with gr.Blocks(css=custom_css,theme='shivi/calm_seafoam',fill_height=True) as demo:
     chat_interface.render()
-    gr.Markdown(LICENSE)
+    # gr.Markdown(LICENSE)
 
 
 # Main Execution
 if __name__ == "__main__":
     demo.queue(max_size=20)
-    demo.launch(share=True)
+    demo.launch(share=True)
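
For reference, the process_text helper added in this commit can be exercised on its own. Below is a minimal, self-contained sketch: the function body is copied from the diff above, the inline comments are paraphrased, and the sample input string is hypothetical, invented only to illustrate how the two regular expressions behave.

import re

# process_text as added in this commit (body copied from app.py; comments paraphrased).
def process_text(text):
    # Replace the "[answer:]" tag with "Answer: ", keeping the text that follows it.
    text = re.sub(r'\[answer:\]\s*', 'Answer: ', text)
    # Drop any other bracketed span; the whitespace around it is left as-is.
    text = re.sub(r'\[.*?\](?<!Answer: )', '', text)
    return text

if __name__ == "__main__":
    # Hypothetical streamed chunk, for illustration only.
    sample = "Sam traced the bug [hint: check the loop bounds] and found it. [answer:] the loop was off by one."
    print(process_text(sample))
    # Prints: "Sam traced the bug  and found it. Answer: the loop was off by one."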