Rams901 committed on
Commit
444fbc0
1 Parent(s): 00b67ab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -40
app.py CHANGED
@@ -21,27 +21,11 @@ from langchain.llms.base import LLM
21
  from typing import Optional, List, Mapping, Any
22
 
23
  import ast
24
- from utils import ClaudeLLM, extract_website_name, remove_numbers
25
 
26
  embeddings = HuggingFaceEmbeddings()
27
  db = FAISS.load_local('db_full', embeddings)
28
-
29
  mp_docs = {}
30
- # llm = ClaudeLLM()
31
- # ChatOpenAI(
32
- # temperature=0,
33
- # model='gpt-3.5-turbo-16k'
34
- # )
35
-
36
-
37
- def add_text(history, text):
38
-
39
- print(history)
40
- history = history + [(text, None)]
41
-
42
- return history, ""
43
-
44
- # pipeline = {'claude': (ClaudeLLM(), 0), 'gpt-3.5': (ChatOpenAI(temperature=0,model='gpt-3.5-turbo-16k'), 65), 'gpt-4': (ChatOpenAI(temperature=0, model='gpt-4'), 30)}
45
 
46
  def retrieve_thoughts(query, n):
47
 
@@ -50,40 +34,29 @@ def retrieve_thoughts(query, n):
50
  df = pd.DataFrame([dict(doc[0])['metadata'] for doc in docs_with_score], )
51
  df = pd.concat((df, pd.DataFrame([dict(doc[0])['page_content'] for doc in docs_with_score], columns = ['page_content'])), axis = 1)
52
  df = pd.concat((df, pd.DataFrame([doc[1] for doc in docs_with_score], columns = ['score'])), axis = 1)
 
53
  df.sort_values("score", inplace = True)
54
 
55
  # TO-DO: What if user query doesn't match what we provide as documents
56
 
57
  tier_1 = df[df['score'] < 1]
58
 
59
- # tier_2 = df[(df['score'] < 0.95) * (df["score"] > 0.7)]
60
 
61
-
62
- chunks_1 = tier_1.groupby(['title', 'url', ]).apply(lambda x: {f"chunk_{i}": row for i, row in enumerate(x.sort_values('id')[['score','page_content']].to_dict('records'))}).values
63
- tier_1_adjusted = tier_1.groupby(['title', 'url']).first().reset_index()[['title', 'url', 'score']]
64
  tier_1_adjusted['ref'] = range(1, len(tier_1_adjusted) + 1 )
65
  tier_1_adjusted['chunks'] = chunks_1
66
- score = tier_1.groupby(['title', 'url', ]).apply(lambda x: x['score'].mean()).values
67
  tier_1_adjusted['score'] = score
68
  tier_1_adjusted.sort_values("score", inplace = True)
69
 
70
- # chunks_2 = tier_2.groupby(['title', 'url', '_id']).apply(lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)).values
71
- # tier_2_adjusted = tier_2.groupby(['title', 'url', '_id']).first().reset_index()[['_id', 'title', 'url']]
72
- # tier_2_adjusted['content'] = chunks_2
73
-
74
  if n:
75
  tier_1_adjusted = tier_1_adjusted[:min(len(tier_1_adjusted), n)]
76
 
77
- print(len(tier_1_adjusted))
78
- # tier_1 = [doc[0] for doc in docs if ((doc[1] < 1))][:5]
79
- # tier_2 = [doc[0] for doc in docs if ((doc[1] > 0.7)*(doc[1] < 1.5))][10:15]
80
-
81
  return {'tier 1':tier_1_adjusted, }
82
 
83
  def qa_retrieve(query, llm):
84
 
85
- # llm = pipeline["claude"][0]
86
-
87
  docs = ""
88
 
89
  global db
@@ -99,13 +72,8 @@ def qa_retrieve(query, llm):
99
  mp_docs = thoughts
100
 
101
  tier_1 = thoughts['tier 1']
102
- # tier_2 = thoughts['tier 2']
103
 
104
- reference = tier_1[['ref', 'url', 'title', 'chunks']].to_dict('records')
105
-
106
- # tier_1 = list(tier_1.apply(lambda x: f"[{int(x['ref'])}] title: {x['title']}\n Content: {x.content}", axis = 1).values)
107
- # print(len(tier_1))
108
- # tier_2 = list(tier_2.apply(lambda x: f"title: {x['title']}\n Content: {x.content}", axis = 1).values)
109
 
110
  return {'Reference': reference}
111
 
@@ -123,5 +91,4 @@ demo = gr.Interface(fn=qa_retrieve, title="cicero-qa-api",
123
  gr.components.JSON( label="Reference")],examples=examples)
124
 
125
  demo.queue(concurrency_count = 4)
126
- demo.launch()
127
-
 
21
  from typing import Optional, List, Mapping, Any
22
 
23
  import ast
24
+ from utils import ClaudeLLM
25
 
26
  embeddings = HuggingFaceEmbeddings()
27
  db = FAISS.load_local('db_full', embeddings)
 
28
  mp_docs = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  def retrieve_thoughts(query, n):
31
 
 
34
  df = pd.DataFrame([dict(doc[0])['metadata'] for doc in docs_with_score], )
35
  df = pd.concat((df, pd.DataFrame([dict(doc[0])['page_content'] for doc in docs_with_score], columns = ['page_content'])), axis = 1)
36
  df = pd.concat((df, pd.DataFrame([doc[1] for doc in docs_with_score], columns = ['score'])), axis = 1)
37
+ df['_id'] = df['_id'].apply(lambda x: str(x))
38
  df.sort_values("score", inplace = True)
39
 
40
  # TO-DO: What if user query doesn't match what we provide as documents
41
 
42
  tier_1 = df[df['score'] < 1]
43
 
 
44
 
45
+ chunks_1 = tier_1.groupby(['_id' ]).apply(lambda x: {f"chunk_{i}": row for i, row in enumerate(x.sort_values('id')[['id', 'score','page_content']].to_dict('records'))}).values
46
+ tier_1_adjusted = tier_1.groupby(['_id']).first().reset_index()[['_id', 'title', 'url', 'score']]
 
47
  tier_1_adjusted['ref'] = range(1, len(tier_1_adjusted) + 1 )
48
  tier_1_adjusted['chunks'] = chunks_1
49
+ score = tier_1.groupby(['_id' ]).apply(lambda x: x['score'].mean()).values
50
  tier_1_adjusted['score'] = score
51
  tier_1_adjusted.sort_values("score", inplace = True)
52
 
 
 
 
 
53
  if n:
54
  tier_1_adjusted = tier_1_adjusted[:min(len(tier_1_adjusted), n)]
55
 
 
 
 
 
56
  return {'tier 1':tier_1_adjusted, }
57
 
58
  def qa_retrieve(query, llm):
59
 
 
 
60
  docs = ""
61
 
62
  global db
 
72
  mp_docs = thoughts
73
 
74
  tier_1 = thoughts['tier 1']
 
75
 
76
+ reference = tier_1[['_id', 'url', 'title', 'chunks', 'score']].to_dict('records')
 
 
 
 
77
 
78
  return {'Reference': reference}
79
 
 
91
  gr.components.JSON( label="Reference")],examples=examples)
92
 
93
  demo.queue(concurrency_count = 4)
94
+ demo.launch()