yonikremer committed
Commit: a671856
Parent: 6e4f775

removed the option to search the web

Browse files:
- app.py (+0, -7)
- hanlde_form_submit.py (+1, -11)
- prompt_engeneering.py (+0, -86)
- tests.py (+3, -20)
- user_instructions_hebrew.md (+25, -15)
app.py CHANGED
@@ -39,12 +39,6 @@ with st.form("request_form"):
         max_chars=2048,
     )
 
-    web_search: bool = st.checkbox(
-        label="Web search",
-        value=True,
-        help="If checked, the model will get your prompt as well as some web search results."
-    )
-
     submitted: bool = st.form_submit_button(
         label="Generate",
         help="Generate the output text.",
@@ -57,7 +51,6 @@ with st.form("request_form"):
             selected_model_name,
             output_length,
             submitted_prompt,
-            web_search,
         )
     except CudaError as e:
         st.error("Out of memory. Please try a smaller model, shorter prompt, or a smaller output length.")
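For orientation, a minimal sketch of what the request form reduces to after this commit. Only max_chars=2048 and the submit button are visible in the diff, so the prompt widget type and its label below are assumptions:

import streamlit as st

with st.form("request_form"):
    # Prompt field; only max_chars=2048 appears in the diff,
    # the rest of this widget is assumed.
    submitted_prompt: str = st.text_area(
        label="Prompt",
        max_chars=2048,
    )
    # The "Web search" checkbox that used to sit here is removed;
    # the submit button now follows the prompt field directly.
    submitted: bool = st.form_submit_button(
        label="Generate",
        help="Generate the output text.",
    )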
hanlde_form_submit.py CHANGED
@@ -5,7 +5,6 @@ import streamlit as st
 from grouped_sampling import GroupedSamplingPipeLine, is_supported, UnsupportedModelNameException
 
 from download_repo import download_pytorch_model
-from prompt_engeneering import rewrite_prompt
 
 
 def is_downloaded(model_name: str) -> bool:
@@ -51,22 +50,16 @@ def generate_text(
     pipeline: GroupedSamplingPipeLine,
     prompt: str,
     output_length: int,
-    web_search: bool,
 ) -> str:
     """
     Generates text using the given pipeline.
     :param pipeline: The pipeline to use. GroupedSamplingPipeLine.
     :param prompt: The prompt to use. str.
     :param output_length: The size of the text to generate in tokens. int > 0.
-    :param web_search: Whether to use web search or not. bool.
     :return: The generated text. str.
     """
-    if web_search:
-        better_prompt = rewrite_prompt(prompt)
-    else:
-        better_prompt = prompt
     return pipeline(
-        prompt_s=better_prompt,
+        prompt_s=prompt,
         max_new_tokens=output_length,
         return_text=True,
         return_full_text=False,
@@ -77,14 +70,12 @@ def on_form_submit(
     model_name: str,
     output_length: int,
     prompt: str,
-    web_search: bool
 ) -> str:
     """
     Called when the user submits the form.
     :param model_name: The name of the model to use.
     :param output_length: The size of the groups to use.
     :param prompt: The prompt to use.
-    :param web_search: Whether to use web search or not.
     :return: The output of the model.
     :raises ValueError: If the model name is not supported, the output length is <= 0,
     the prompt is empty or longer than
@@ -111,7 +102,6 @@ def on_form_submit(
         pipeline=pipeline,
         prompt=prompt,
         output_length=output_length,
-        web_search=web_search,
     )
     generation_end_time = time()
     generation_time = generation_end_time - generation_start_time
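The net effect is that generate_text now forwards the user's prompt to the pipeline untouched. A minimal sketch of an equivalent direct call, using only the keyword arguments visible in the diff (how the pipeline object is constructed is not shown in this commit):

from grouped_sampling import GroupedSamplingPipeLine

def run_raw_prompt(pipeline: GroupedSamplingPipeLine, prompt: str, output_length: int) -> str:
    # Mirrors generate_text after this commit: there is no rewrite_prompt
    # step; the prompt string goes straight into the pipeline.
    return pipeline(
        prompt_s=prompt,
        max_new_tokens=output_length,
        return_text=True,
        return_full_text=False,
    )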
prompt_engeneering.py DELETED
@@ -1,86 +0,0 @@
-import os
-from dataclasses import dataclass
-from datetime import datetime
-from typing import Generator, Dict, List
-
-from googleapiclient.discovery import build
-from streamlit import secrets
-
-INSTRUCTIONS = "Instructions: " \
-               "Using the provided web search results, " \
-               "write a comprehensive reply to the given query. " \
-               "Make sure to cite results using [[number](URL)] notation after the reference. " \
-               "If the provided search results refer to multiple subjects with the same name, " \
-               "write separate answers for each subject."
-
-
-def get_google_api_key():
-    """Returns the Google API key from streamlit's secrets"""
-    try:
-        return secrets["google_search_api_key"]
-    except (FileNotFoundError, IsADirectoryError):
-        return os.environ["google_search_api_key"]
-
-
-def get_google_cse_id():
-    """Returns the Google CSE ID from streamlit's secrets"""
-    try:
-        return secrets["google_cse_id"]
-    except (FileNotFoundError, IsADirectoryError):
-        return os.environ["google_cse_id"]
-
-
-def google_search(search_term, **kwargs) -> list:
-    service = build("customsearch", "v1", developerKey=get_google_api_key())
-    search_engine = service.cse()
-    res = search_engine.list(q=search_term, cx=get_google_cse_id(), **kwargs).execute()
-    return res['items']
-
-
-@dataclass
-class SearchResult:
-    __slots__ = ["title", "body", "url"]
-    title: str
-    body: str
-    url: str
-
-
-def get_web_search_results(
-    query: str,
-    num_results: int,
-) -> Generator[SearchResult, None, None]:
-    """Gets a list of web search results using the Google search API"""
-    rew_results: List[Dict[str, str]] = google_search(
-        search_term=query,
-        num=num_results
-    )[:num_results]
-    for result in rew_results:
-        if result["snippet"].endswith("\xa0..."):
-            result["snippet"] = result["snippet"][:-4]
-        yield SearchResult(
-            title=result["title"],
-            body=result["snippet"],
-            url=result["link"],
-        )
-
-
-def format_search_result(search_result: Generator[SearchResult, None, None]) -> str:
-    """Formats a search result to be added to the prompt."""
-    ans = ""
-    for i, result in enumerate(search_result):
-        ans += f"[{i}] {result.body}\nURL: {result.url}\n\n"
-    return ans
-
-
-def rewrite_prompt(
-    prompt: str,
-) -> str:
-    """Rewrites the prompt by adding web search results to it."""
-    raw_results = get_web_search_results(
-        query=prompt,
-        num_results=5,
-    )
-    formatted_results = "Web search results:\n" + format_search_result(raw_results)
-    formatted_date = "Current date: " + datetime.now().strftime("%d/%m/%Y")
-    formatted_prompt = f"Query: {prompt}"
-    return "\n".join([formatted_results, formatted_date, INSTRUCTIONS, formatted_prompt])
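Since the deleted rewrite_prompt joined its four parts with newlines in a fixed order, its return value had roughly the shape below; the snippet and link lines are placeholders for illustration, and the date used the %d/%m/%Y format:

Web search results:
[0] first result snippet
URL: first result link

[1] second result snippet
URL: second result link

Current date: <dd/mm/yyyy>
Instructions: Using the provided web search results, write a comprehensive reply to the given query. Make sure to cite results using [[number](URL)] notation after the reference. If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.
Query: <the user's prompt>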
tests.py CHANGED
@@ -5,27 +5,10 @@ from grouped_sampling import GroupedSamplingPipeLine, get_full_models_list, UnsupportedModelNameException
 
 from on_server_start import download_useful_models
 from hanlde_form_submit import create_pipeline, on_form_submit
-from prompt_engeneering import rewrite_prompt
 
 HUGGING_FACE_CACHE_DIR = "/home/yoni/.cache/huggingface/hub"
 
 
-def test_prompt_engineering():
-    example_prompt = "Answer yes or no, is the sky blue?"
-    rewritten_prompt = rewrite_prompt(example_prompt)
-    assert rewritten_prompt.startswith("Web search results:")
-    assert rewritten_prompt.endswith("Query: Answer yes or no, is the sky blue?")
-    assert "Current date: " in rewritten_prompt
-    assert "Instructions: " in rewritten_prompt
-
-
-def test_get_supported_model_names():
-    supported_model_names = get_full_models_list()
-    assert len(supported_model_names) > 0
-    assert "gpt2" in supported_model_names
-    assert all(isinstance(name, str) for name in supported_model_names)
-
-
 def test_on_server_start():
     download_useful_models()
     assert os.path.exists(HUGGING_FACE_CACHE_DIR)
@@ -36,15 +19,15 @@ def test_on_form_submit():
     model_name = "gpt2"
     output_length = 10
     prompt = "Answer yes or no, is the sky blue?"
-    output = on_form_submit(model_name, output_length, prompt
+    output = on_form_submit(model_name, output_length, prompt)
     assert output is not None
     assert len(output) > 0
     empty_prompt = ""
     with pytest.raises(ValueError):
-        on_form_submit(model_name, output_length, empty_prompt
+        on_form_submit(model_name, output_length, empty_prompt)
     unsupported_model_name = "unsupported_model_name"
     with pytest.raises(UnsupportedModelNameException):
-        on_form_submit(unsupported_model_name, output_length, prompt
+        on_form_submit(unsupported_model_name, output_length, prompt)
 
 
 @pytest.mark.parametrize(
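For reference, the updated three-argument call that the surviving test exercises, reusing the test's own values (the first run downloads gpt2, so it needs network access):

from hanlde_form_submit import on_form_submit

output = on_form_submit("gpt2", 10, "Answer yes or no, is the sky blue?")
assert output is not None and len(output) > 0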
user_instructions_hebrew.md CHANGED
@@ -2,16 +2,27 @@
 
 In this demo, you can easily use grouped sampling.
 
-
-Write clear instructions to the model and choose the length of the text you want the model to generate.
+## Step one - choose a model
 
-
+- The models in the demo are
 
-1.
-2.
-
-
-
+1. opt-iml-max-1.3B (the small model)
+2. opt-iml-max-30B (the large model)
+
+The large model produces better texts, but slowly.
+
+The small model produces slightly less good texts, but quickly.
+
+Write clear instructions to the algorithm and choose the length of the text you want the algorithm to generate.
+
+## Explain to the algorithm what to do
+
+### Tips for writing the input to the model
+
+1. Remember that this is only a demo; it cannot run the most powerful models, only models that are open to the public.
+2. Write in English, not in Hebrew. Most models do not support Hebrew at all, and those that do don't produce texts of good enough quality, so the demo does not support them.
+3. Put thought into the length of the text you want the algorithm to generate.
+4. Tell the algorithm exactly what to do - start with an instruction and separate the parts.
 
 For example:
 
@@ -21,7 +32,7 @@
 
 Answer: """
 
-
+5. Use templates with examples - for example:
 
 Instruction: """Label the following sentences to positive or negative"""
 
@@ -31,12 +42,11 @@
 
 Sentence: """your sentence here.""" Sentiment: """
 
-
-
-
-
-
-12. Start the task yourself
+6. Do not end the input to the algorithm with a space
+7. Help the algorithm - provide it with all the required information
+8. For complicated tasks, start the input to the algorithm with the sentence "Let's think step by step"
+9. Be specific
+10. Start the task yourself
 
 Example:
 