Cicero-semantic-search-merged-v1

Sleeping

App Files Community

Rams901 committed on Sep 15, 2023

Commit

444fbc0

•

1 Parent(s): 00b67ab

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -40

app.py CHANGED Viewed

@@ -21,27 +21,11 @@ from langchain.llms.base import LLM
 from typing import Optional, List, Mapping, Any
 import ast
-from utils import ClaudeLLM, extract_website_name, remove_numbers
 embeddings = HuggingFaceEmbeddings()
 db = FAISS.load_local('db_full', embeddings)
 mp_docs = {}
-# llm = ClaudeLLM()
-# ChatOpenAI(
-#             temperature=0,
-#             model='gpt-3.5-turbo-16k'
-#         )
-def add_text(history, text):
-    print(history)
-    history = history + [(text, None)]
-    return history, ""
-# pipeline = {'claude': (ClaudeLLM(), 0), 'gpt-3.5': (ChatOpenAI(temperature=0,model='gpt-3.5-turbo-16k'), 65), 'gpt-4': (ChatOpenAI(temperature=0, model='gpt-4'), 30)}
 def retrieve_thoughts(query, n):
@@ -50,40 +34,29 @@ def retrieve_thoughts(query, n):
     df = pd.DataFrame([dict(doc[0])['metadata'] for doc in docs_with_score], )
     df = pd.concat((df, pd.DataFrame([dict(doc[0])['page_content'] for doc in docs_with_score], columns = ['page_content'])), axis = 1)
     df = pd.concat((df, pd.DataFrame([doc[1] for doc in docs_with_score], columns = ['score'])), axis = 1)
     df.sort_values("score", inplace = True)
   # TO-DO: What if user query doesn't match what we provide as documents
     tier_1 = df[df['score'] < 1]
-    # tier_2 = df[(df['score'] < 0.95) * (df["score"] > 0.7)]
-    chunks_1 = tier_1.groupby(['title', 'url', ]).apply(lambda x: {f"chunk_{i}": row for i, row  in enumerate(x.sort_values('id')[['score','page_content']].to_dict('records'))}).values
-    tier_1_adjusted = tier_1.groupby(['title', 'url']).first().reset_index()[['title', 'url', 'score']]
     tier_1_adjusted['ref'] = range(1, len(tier_1_adjusted) + 1 )
     tier_1_adjusted['chunks'] = chunks_1
-    score = tier_1.groupby(['title', 'url', ]).apply(lambda x: x['score'].mean()).values
     tier_1_adjusted['score'] = score
     tier_1_adjusted.sort_values("score", inplace = True)
-    # chunks_2 = tier_2.groupby(['title', 'url', '_id']).apply(lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)).values
-    # tier_2_adjusted = tier_2.groupby(['title', 'url', '_id']).first().reset_index()[['_id', 'title', 'url']]
-    # tier_2_adjusted['content'] = chunks_2
     if n:
       tier_1_adjusted = tier_1_adjusted[:min(len(tier_1_adjusted), n)]
-    print(len(tier_1_adjusted))
-  # tier_1 = [doc[0]  for doc in docs if ((doc[1] < 1))][:5]
-  # tier_2 = [doc[0]  for doc in docs if ((doc[1] > 0.7)*(doc[1] < 1.5))][10:15]
     return {'tier 1':tier_1_adjusted, }
 def qa_retrieve(query, llm):
-    # llm = pipeline["claude"][0]
     docs = ""
     global db
@@ -99,13 +72,8 @@ def qa_retrieve(query, llm):
         mp_docs = thoughts
     tier_1 = thoughts['tier 1']
-    # tier_2 = thoughts['tier 2']
-    reference = tier_1[['ref', 'url', 'title', 'chunks']].to_dict('records')
-    # tier_1 = list(tier_1.apply(lambda x: f"[{int(x['ref'])}] title: {x['title']}\n Content: {x.content}", axis = 1).values)
-    # print(len(tier_1))
-    # tier_2 = list(tier_2.apply(lambda x: f"title: {x['title']}\n Content: {x.content}", axis = 1).values)
     return {'Reference': reference}
@@ -123,5 +91,4 @@ demo = gr.Interface(fn=qa_retrieve, title="cicero-qa-api",
                              gr.components.JSON( label="Reference")],examples=examples)
 demo.queue(concurrency_count = 4)
-demo.launch()

 from typing import Optional, List, Mapping, Any
 import ast
+from utils import ClaudeLLM
 embeddings = HuggingFaceEmbeddings()
 db = FAISS.load_local('db_full', embeddings)
 mp_docs = {}
 def retrieve_thoughts(query, n):
     df = pd.DataFrame([dict(doc[0])['metadata'] for doc in docs_with_score], )
     df = pd.concat((df, pd.DataFrame([dict(doc[0])['page_content'] for doc in docs_with_score], columns = ['page_content'])), axis = 1)
     df = pd.concat((df, pd.DataFrame([doc[1] for doc in docs_with_score], columns = ['score'])), axis = 1)
+    df['_id'] = df['_id'].apply(lambda x: str(x))
     df.sort_values("score", inplace = True)
   # TO-DO: What if user query doesn't match what we provide as documents
     tier_1 = df[df['score'] < 1]
+    chunks_1 = tier_1.groupby(['_id' ]).apply(lambda x: {f"chunk_{i}": row for i, row  in enumerate(x.sort_values('id')[['id', 'score','page_content']].to_dict('records'))}).values
+    tier_1_adjusted = tier_1.groupby(['_id']).first().reset_index()[['_id', 'title', 'url', 'score']]
     tier_1_adjusted['ref'] = range(1, len(tier_1_adjusted) + 1 )
     tier_1_adjusted['chunks'] = chunks_1
+    score = tier_1.groupby(['_id' ]).apply(lambda x: x['score'].mean()).values
     tier_1_adjusted['score'] = score
     tier_1_adjusted.sort_values("score", inplace = True)
     if n:
       tier_1_adjusted = tier_1_adjusted[:min(len(tier_1_adjusted), n)]
     return {'tier 1':tier_1_adjusted, }
 def qa_retrieve(query, llm):
     docs = ""
     global db
         mp_docs = thoughts
     tier_1 = thoughts['tier 1']
+    reference = tier_1[['_id', 'url', 'title', 'chunks', 'score']].to_dict('records')
     return {'Reference': reference}
                              gr.components.JSON( label="Reference")],examples=examples)
 demo.queue(concurrency_count = 4)
+demo.launch()