Spaces:

huggingface-projects
/

llama-2-7b-chat

Running on Zero

App Files Files

xet

Community

Update app.py

#58

by dzz5181 - opened Aug 6, 2024

base: refs/heads/main

←

from: refs/pr/58

Discussion Files changed

+49

-63

Files changed (1) hide show

app.py +49 -63

app.py CHANGED Viewed

@@ -7,14 +7,23 @@ import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 DESCRIPTION = """\
-# Llama-2 7B Chat
-This Space demonstrates model [Llama-2-7b-chat](https://huggingface.co/meta-llama/Llama-2-7b-chat) by Meta, a Llama 2 model with 7B parameters fine-tuned for chat instructions. Feel free to play with it, or duplicate to run generations without a queue! If you want to run your own service, you can also [deploy the model on Inference Endpoints](https://huggingface.co/inference-endpoints).
 🔎 For more details about the Llama 2 family of models and how to use them with `transformers`, take a look [at our blog post](https://huggingface.co/blog/llama2).
@@ -32,13 +41,32 @@ this demo is governed by the original [license](https://huggingface.co/spaces/hu
 if not torch.cuda.is_available():
     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
 if torch.cuda.is_available():
-    model_id = "meta-llama/Llama-2-7b-chat-hf"
-    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
     tokenizer.use_default_system_prompt = False
 @spaces.GPU
 def generate(
@@ -84,63 +112,21 @@ def generate(
         outputs.append(text)
         yield "".join(outputs)
-chat_interface = gr.ChatInterface(
-    fn=generate,
-    additional_inputs=[
-        gr.Textbox(label="System prompt", lines=6),
-        gr.Slider(
-            label="Max new tokens",
-            minimum=1,
-            maximum=MAX_MAX_NEW_TOKENS,
-            step=1,
-            value=DEFAULT_MAX_NEW_TOKENS,
-        ),
-        gr.Slider(
-            label="Temperature",
-            minimum=0.1,
-            maximum=4.0,
-            step=0.1,
-            value=0.6,
-        ),
-        gr.Slider(
-            label="Top-p (nucleus sampling)",
-            minimum=0.05,
-            maximum=1.0,
-            step=0.05,
-            value=0.9,
-        ),
-        gr.Slider(
-            label="Top-k",
-            minimum=1,
-            maximum=1000,
-            step=1,
-            value=50,
-        ),
-        gr.Slider(
-            label="Repetition penalty",
-            minimum=1.0,
-            maximum=2.0,
-            step=0.05,
-            value=1.2,
-        ),
-    ],
-    stop_btn=None,
-    examples=[
-        ["Hello there! How are you doing?"],
-        ["Can you explain briefly to me what is the Python programming language?"],
-        ["Explain the plot of Cinderella in a sentence."],
-        ["How many hours does it take a man to eat a Helicopter?"],
-        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
-    ],
-    cache_examples=False,
 )
-with gr.Blocks(css="style.css", fill_height=True) as demo:
-    gr.Markdown(DESCRIPTION)
-    gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
-    chat_interface.render()
-    gr.Markdown(LICENSE)
-if __name__ == "__main__":
-    demo.queue(max_size=20).launch()

 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from llama_index.core.prompts.prompts import SimpleInputPrompt
+from llama_index.llms.huggingface import HuggingFaceLLM
+from llama_index.legacy.embeddings.langchain import LangchainEmbedding
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+from llama_index.core import set_global_service_context, ServiceContext, VectorStoreIndex, Document
+from pathlib import Path
+import fitz  # PyMuPDF
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 DESCRIPTION = """\
+# Llama-2 7B Chat with Document Context
+This Space demonstrates model [Llama-2-7b-chat](https://huggingface.co/meta-llama/Llama-2-7b-chat) by Meta, a Llama 2 model with 7B parameters fine-tuned for chat instructions, now enhanced with document-based context.
+Feel free to play with it, or duplicate to run generations without a queue! If you want to run your own service, you can also [deploy the model on Inference Endpoints](https://huggingface.co/inference-endpoints).
 🔎 For more details about the Llama 2 family of models and how to use them with `transformers`, take a look [at our blog post](https://huggingface.co/blog/llama2).
 if not torch.cuda.is_available():
     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
 if torch.cuda.is_available():
+    # The first line of HF_TOKEN.txt is used as the access token for both
+    # from_pretrained calls below.
+    model_name = "meta-llama/Llama-2-7b-chat-hf"
+    # A context manager closes the token file as soon as the line is read,
+    # instead of leaving the handle open for the life of the process.
+    with open("HF_TOKEN.txt", encoding="utf-8") as token_file:
+        auth_token = token_file.readline().strip()
+    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto", token=auth_token)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir='./model/', token=auth_token)
     tokenizer.use_default_system_prompt = False
+    # Load documents and create the index
+    def read_pdf_to_documents(file_path):
+        """Read a PDF and return one ``Document`` per page.
+
+        Args:
+            file_path: Location of the PDF; passed unchanged to ``fitz.open``.
+
+        Returns:
+            A list of ``Document`` objects in page order, each built from the
+            text that ``page.get_text()`` returns for that page.
+        """
+        # The context manager closes the PDF once its text has been read, so
+        # the file handle is not kept open after indexing.
+        with fitz.open(file_path) as doc:
+            return [Document(text=page.get_text()) for page in doc]
+    file_path = Path('/content/Full_Pamplet.pdf')  # Hard-coded absolute path; update with your document path
+    documents = read_pdf_to_documents(file_path)  # return value of the PDF reader defined just above
+    embeddings = LangchainEmbedding(HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"))  # LangChain HuggingFace embeddings wrapped in LangchainEmbedding
+    service_context = ServiceContext.from_defaults(chunk_size=1024, embed_model=embeddings)  # chunk_size 1024, using the embedding model above
+    set_global_service_context(service_context)  # presumably makes these settings the default for the index built next — confirm
+    index = VectorStoreIndex.from_documents(documents)
+    query_engine = index.as_query_engine()  # module-level engine that query_model() calls
 @spaces.GPU
 def generate(
         outputs.append(text)
         yield "".join(outputs)
+def query_model(question):
+    """Send ``question`` to the module-level ``query_engine`` and return the result's ``response`` attribute."""
+    return query_engine.query(question).response
+update_prompt_interface = gr.Interface(
+    fn=update_system_prompt,  # NOTE(review): update_system_prompt is not defined in the visible part of this diff — confirm it exists
+    inputs=gr.Textbox(lines=5, placeholder="Enter the system prompt here...", label="System Prompt", value=system_prompt),  # NOTE(review): system_prompt is likewise not visible here — confirm
+    outputs=gr.Textbox(label="Status"),
+    title="System Prompt Updater",
+    description="Update the system prompt used for context."
 )
+query_interface = gr.Interface(
+    fn=query_model,
+    inputs=gr.Textbox(lines=2, placeholder="Enter your question here...", label="User Question"),
+    outputs=gr.Textbox(label="Response"),
+    title="Document Query Assistant",
+    description="Ask questions based on the conte