Spaces: Runtime error

michal committed • Commit 272ec4b • 1 Parent(s): 6f03bef

refactor

Browse files:
- app.py +4 -104
- wiki_funcs.py +70 -0
app.py CHANGED
@@ -43,111 +43,12 @@ from datasets import load_dataset
 
 
 from greg_funcs import get_llm_response
+from wiki_funcs import mysearch, mygreetings
 
-"""# import models"""
-
-bi_encoder = SentenceTransformer(
-    'sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
-bi_encoder.max_seq_length = 256  # Truncate long passages to 256 tokens
-
-#
-
-
-"""# import datasets"""
-dataset = load_dataset("gfhayworth/hack_policy", split='train')
-
-mypassages = list(dataset.to_pandas()['psg'])
-
-dataset_embed = load_dataset("gfhayworth/hack_policy_embed", split='train')
-
-dataset_embed_pd = dataset_embed.to_pandas()
-mycorpus_embeddings = torch_tensor(dataset_embed_pd.values)
-
-
-def search(query, top_k=20, top_n=1):
-    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
-    hits = util.semantic_search(
-        question_embedding, mycorpus_embeddings, top_k=top_k)
-    hits = hits[0]  # Get the hits for the first query
-
-    ##### Re-Ranking #####
-    cross_inp = [[query, mypassages[hit['corpus_id']]] for hit in hits]
-    cross_scores = cross_encoder.predict(cross_inp)
-
-    # Sort results by the cross-encoder scores
-    for idx in range(len(cross_scores)):
-        hits[idx]['cross-score'] = cross_scores[idx]
-
-    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
-    predictions = hits[:top_n]
-    return predictions
-    # for hit in hits[0:3]:
-    #     print("\t{:.3f}\t{}".format(hit['cross-score'], mypassages[hit['corpus_id']].replace("\n", " ")))
-
-
-def get_text(qry):
-    # predictions = greg_search(qry)
-    predictions = search(qry)
-    prediction_text = []
-    for hit in predictions:
-        prediction_text.append("{}".format(mypassages[hit['corpus_id']]))
-    return prediction_text
-
-
-@tool
-def mysearch(query: str) -> str:
-    """Query our own datasets.
-    """
-    rslt = get_text(query)
-    return '\n'.join(rslt)
-
-
-@tool
-def mygreetings(greeting: str) -> str:
-    """Let us do our greetings
-    """
-
-    return "how are you?"
-
-# mysearch("who is the best rapper in the world?")
-
-# """# chat example"""
-# def chat(message, history):
-#     history = history or []
-#     message = message.lower()
-
-#     responses = get_text(message)
-#     for response in responses:
-#         history.append((message, response))
-#     return history, history
-
-
-# with gr.Blocks(css=CSS) as demo:
-#     history_state = gr.State()
-#     gr.Markdown('# WikiBot')
-#     title = 'Wikipedia Chatbot'
-#     description = 'chatbot with search on Wikipedia'
-#     with gr.Row():
-#         chatbot = gr.Chatbot()
-#     with gr.Row():
-#         message = gr.Textbox(label='Input your question here:',
-#                              placeholder='How many countries are in Europe?',
-#                              lines=1)
-#         submit = gr.Button(value='Send',
-#                            variant='secondary').style(full_width=False)
-#     submit.click(chat,
-#                  inputs=[message, history_state],
-#                  outputs=[chatbot, history_state])
-#     gr.Examples(
-#         examples=["How many countries are in Europe?",
-#                   "Was Roman Emperor Constantine I a Christian?",
-#                   "Who is the best rapper in the world?"],
-#         inputs=message
-#     )
-
-# demo.launch()
-
-OPENAI_API_KEY = "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"
+# OPENAI_API_KEY = "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"
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # "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"
 
 # AWS keys
 aws_access_key_id = "AKIA3JRWKI2EE5ZFN5NZ"
@@ -160,7 +61,7 @@ os.environ["AWS_DEFAULT_REGION"] = aws_region_name
 
 # exhumana api key
 # todo: may need to pay to get one
-os.environ['EXHUMAN_API_KEY'] = ''
+os.environ['EXHUMAN_API_KEY'] = ''  # XXX remove, we are not using the talking head because it costs money and doesnt work.
 
 # news, tmdb keys
 os.environ["NEWS_API_KEY"] = ''
@@ -171,7 +72,6 @@ tmdb_bearer_token = os.environ["TMDB_BEARER_TOKEN"]
 
 TOOLS_LIST = ['serpapi', 'wolfram-alpha', 'pal-math', 'pal-colored-objects', 'news-api', 'tmdb-api',
               'open-meteo-api']  # 'google-search'
-# TOOLS_DEFAULT_LIST = ['mysearch', 'serpapi', 'pal-math']
 TOOLS_DEFAULT_LIST = ['mysearch']
 BUG_FOUND_MSG = "Congratulations, you've found a bug in this application!"
 AUTH_ERR_MSG = "Please paste your OpenAI key from openai.com to use this application. It is not necessary to hit a button or key after pasting it."
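With this change, app.py no longer hard-codes the OpenAI key and instead reads it from the environment. A minimal sketch of the pattern the refactored file now relies on; the explicit fail-fast guard is illustrative only, not part of the commit, and assumes the key is configured as a Space secret:

    import os

    from wiki_funcs import mysearch, mygreetings  # retrieval tools now live in wiki_funcs.py

    # Read the key from the environment instead of committing it to the repo.
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    if OPENAI_API_KEY is None:
        # Illustrative guard: fail early if the secret is not configured.
        raise RuntimeError("OPENAI_API_KEY environment variable is not set")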
wiki_funcs.py ADDED
@@ -0,0 +1,70 @@
+from langchain.agents import tool
+
+from torch import tensor as torch_tensor
+from datasets import load_dataset
+from sentence_transformers import SentenceTransformer, CrossEncoder, util
+
+"""# import models"""
+
+bi_encoder = SentenceTransformer(
+    'sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
+bi_encoder.max_seq_length = 256  # Truncate long passages to 256 tokens
+
+# The bi-encoder will retrieve top_k documents. We use a cross-encoder to re-rank the results list to improve the quality
+cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
+
+"""# import datasets"""
+dataset = load_dataset("gfhayworth/wiki_mini", split='train')
+
+mypassages = list(dataset.to_pandas()['psg'])
+
+dataset_embed = load_dataset("gfhayworth/wiki_mini_embed", split='train')
+
+dataset_embed_pd = dataset_embed.to_pandas()
+mycorpus_embeddings = torch_tensor(dataset_embed_pd.values)
+
+
+def search(query, top_k=20, top_n=1):
+    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
+    hits = util.semantic_search(
+        question_embedding, mycorpus_embeddings, top_k=top_k)
+    hits = hits[0]  # Get the hits for the first query
+
+    ##### Re-Ranking #####
+    cross_inp = [[query, mypassages[hit['corpus_id']]] for hit in hits]
+    cross_scores = cross_encoder.predict(cross_inp)
+
+    # Sort results by the cross-encoder scores
+    for idx in range(len(cross_scores)):
+        hits[idx]['cross-score'] = cross_scores[idx]
+
+    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
+    predictions = hits[:top_n]
+    return predictions
+    # for hit in hits[0:3]:
+    #     print("\t{:.3f}\t{}".format(hit['cross-score'], mypassages[hit['corpus_id']].replace("\n", " ")))
+
+
+def get_text(qry):
+    # predictions = greg_search(qry)
+    predictions = search(qry)
+    prediction_text = []
+    for hit in predictions:
+        prediction_text.append("{}".format(mypassages[hit['corpus_id']]))
+    return prediction_text
+
+
+@tool
+def mysearch(query: str) -> str:
+    """Query our own datasets.
+    """
+    rslt = get_text(query)
+    return '\n'.join(rslt)
+
+
+@tool
+def mygreetings(greeting: str) -> str:
+    """Let us do our greetings
+    """
+
+    return "how are you?"
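wiki_funcs.py packages a retrieve-then-rerank pipeline: the bi-encoder embeds the query and pulls the top_k most similar passages from the precomputed corpus embeddings, then the cross-encoder scores each (query, passage) pair so only the best top_n survive. A minimal usage sketch, assuming the model and dataset downloads above succeed; since mysearch is wrapped by LangChain's @tool decorator, it is invoked through the tool interface rather than called directly:

    from wiki_funcs import mysearch, search

    # Direct call into the pipeline: 20 bi-encoder candidates, best 1 after re-ranking.
    hits = search("How many countries are in Europe?", top_k=20, top_n=1)

    # Through the LangChain tool wrapper that app.py registers as 'mysearch'.
    print(mysearch.run("How many countries are in Europe?"))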