Rams901 committed on
Commit 734db66
0 Parent(s):

Duplicate from Rams901/Cicero-QA-themes

Files changed (7):
  1. .gitattributes +35 -0
  2. README.md +13 -0
  3. app.py +192 -0
  4. db_full/index.faiss +3 -0
  5. db_full/index.pkl +3 -0
  6. requirements.txt +9 -0
  7. utils.py +49 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ db_full/index.faiss filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Cicero Interactive QA Dev
+ emoji: 🏃
+ colorFrom: green
+ colorTo: gray
+ sdk: gradio
+ sdk_version: 3.23.0
+ app_file: app.py
+ pinned: false
+ duplicated_from: Rams901/Cicero-QA-themes
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,192 @@
+ import gradio as gr
+ import numpy as np
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.chains import LLMChain
+ from langchain import PromptTemplate
+ import re
+ import pandas as pd
+ from langchain.vectorstores import FAISS
+ import requests
+ from typing import List
+ from langchain.schema import (
+     SystemMessage,
+     HumanMessage,
+     AIMessage
+ )
+ import os
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.chat_models import ChatOpenAI
+
+ from langchain.llms.base import LLM
+ from typing import Optional, List, Mapping, Any
+
+ import ast
+ from utils import ClaudeLLM, extract_website_name, remove_numbers
+
+ embeddings = HuggingFaceEmbeddings()
+ db = FAISS.load_local('db_full', embeddings)
+
+ mp_docs = {}
+ llm = ClaudeLLM()
+ # ChatOpenAI(
+ #     temperature=0,
+ #     model='gpt-3.5-turbo-16k'
+ # )
+
+
+ def add_text(history, text):
+
+     print(history)
+     history = history + [(text, None)]
+
+     return history, ""
+
+ pipeline = {'claude': (ClaudeLLM(), 0), 'gpt-3.5': (ChatOpenAI(temperature=0, model='gpt-3.5-turbo-16k'), 65), 'gpt-4': (ChatOpenAI(temperature=0, model='gpt-4'), 30)}
+
+ def retrieve_thoughts(query, n):
+
+     # print(db.similarity_search_with_score(query = query, k = k, fetch_k = k*10))
+     docs_with_score = db.similarity_search_with_score(query = query, k = len(db.index_to_docstore_id.values()), fetch_k = len(db.index_to_docstore_id.values()))
+     df = pd.DataFrame([dict(doc[0])['metadata'] for doc in docs_with_score])
+     df = pd.concat((df, pd.DataFrame([dict(doc[0])['page_content'] for doc in docs_with_score], columns = ['page_content'])), axis = 1)
+     df = pd.concat((df, pd.DataFrame([doc[1] for doc in docs_with_score], columns = ['score'])), axis = 1)
+
+     # TO-DO: What if the user query doesn't match any of the documents we provide?
+
+     tier_1 = df[df['score'] < 0.7]
+     tier_2 = df[(df['score'] < 0.95) & (df["score"] > 0.7)]
+
+
+     chunks_1 = tier_1.groupby(['title', 'url', '_id']).apply(lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)).values
+     tier_1_adjusted = tier_1.groupby(['title', 'url', '_id']).first().reset_index()[['_id', 'title', 'url']]
+     tier_1_adjusted['ref'] = range(1, len(tier_1_adjusted) + 1)
+     tier_1_adjusted['content'] = chunks_1
+
+     chunks_2 = tier_2.groupby(['title', 'url', '_id']).apply(lambda x: "\n...\n".join(x.sort_values('id')['page_content'].values)).values
+     tier_2_adjusted = tier_2.groupby(['title', 'url', '_id']).first().reset_index()[['_id', 'title', 'url']]
+     tier_2_adjusted['content'] = chunks_2
+
+     if n:
+         tier_1_adjusted = tier_1_adjusted[:min(len(tier_1_adjusted), n)]
+
+     print(len(tier_1_adjusted))
+     # tier_1 = [doc[0] for doc in docs if ((doc[1] < 1))][:5]
+     # tier_2 = [doc[0] for doc in docs if ((doc[1] > 0.7)*(doc[1] < 1.5))][10:15]
+
+     return {'tier 1': tier_1_adjusted, 'tier 2': tier_2_adjusted.loc[:5]}  # tier_2_adjusted carries the 'content' column used downstream
+
+ def qa_retrieve(query, llm = None):  # llm is overridden below; default added so the single-input Gradio interface can call this
+
+     llm = pipeline["claude"][0]
+
+     docs = ""
+
+     global db
+     print(db)
+
+     global mp_docs
+     thoughts = retrieve_thoughts(query, 0)
+     if not(thoughts):
+
+         if mp_docs:
+             thoughts = mp_docs
+     else:
+         mp_docs = thoughts
+
+     tier_1 = thoughts['tier 1']
+     tier_2 = thoughts['tier 2']
+
+     reference = tier_1[['ref', 'url', 'title']].to_dict('records')
+
+     tier_1 = list(tier_1.apply(lambda x: f"[{int(x['ref'])}] title: {x['title']}\n Content: {x.content}", axis = 1).values)
+     print(len(tier_1))
+     tier_2 = list(tier_2.apply(lambda x: f"title: {x['title']}\n Content: {x.content}", axis = 1).values)
+
+     print(f"QUERY: {query}\nTIER 1: {tier_1}\nTIER2: {tier_2}")
+     # print(f"DOCS RETRIEVED: {mp_docs.values}")
+
+     # Synthesis generation
+     session_prompt = """ A bot that is open to discussions about different cultural, philosophical and political exchanges. You will perform different analyses of the articles provided. Stay truthful and, if you weren't provided any resources, give your opinion only."""
+     task = """Your primary responsibility is to identify multiple themes from the given articles. For each theme detected, you are to present it under three separate categories:
+
+ 1. Theme Title - An easy-to-understand title that encapsulates the core idea of the theme extracted from the article.
+
+ 2. Theme Description - An expanded elaboration that explores the theme in detail based on the arguments and points provided in the article.
+
+ 3. Quotes related to theme - Locate and provide at least one compelling quote from the article that directly supports or showcases the theme you have identified. This quote should serve as specific evidence or an example from the article text that corresponds directly to the developed theme.
+
+ The extracted themes should be written in a structured manner, ensuring clarity and a meaningful correlation between the themes and the articles. Make sure your analysis is rooted in the arguments given in the article. Avoid including personal opinions or making generalizations that are not explicitly supported by the articles. """
+
+
+     prompt = PromptTemplate(
+         input_variables=["query", "task", "session_prompt", "articles"],
+         template="""
+ You are a {session_prompt}
+ {task}
+
+ query: {query}
+
+ Articles:
+ {articles}
+
+
+ The extracted themes should be written in a structured manner, ensuring clarity and a meaningful correlation between the themes and the articles. Make sure your analysis is rooted in the arguments given in the article. Avoid including personal opinions or making generalizations that are not explicitly supported by the articles.
+
+ """,
+     )
+
+
+     # llm = BardLLM()
+     chain = LLMChain(llm=llm, prompt = prompt)
+
+     response = chain.run(query=query, articles="\n".join(tier_1), session_prompt = session_prompt, task = task)
+
+     for i in range(5):
+         response = response.replace(f'[{i}]', f"<span class='text-primary'>[{i}]</span>")
+
+     # Generate related questions
+     prompt_q = PromptTemplate(
+         input_variables=["session_prompt", "articles"],
+         template="""
+ You are a {session_prompt}
+ Give general/global questions related to the following articles:
+
+ Articles:
+ {articles}
+
+
+ Make sure not to ask specific questions; keep them general, short and concise.
+ """,
+     )
+
+     chain_q = LLMChain(llm=ClaudeLLM(), prompt = prompt_q)
+
+     questions = chain_q.run(session_prompt = session_prompt, articles = "\n".join(tier_2))
+     print(questions)
+     questions = questions[questions.index('1'):]
+
+     questions = [remove_numbers(t).strip() for (i, t) in enumerate(questions.split('.')) if len(t) > 5][:5]
+     print(questions)
+
+     # TO-DO: initiate models in another function, refactor code to be reusable
+
+     # json_resp = {'cynthesis': response, 'questions': questions, 'Reference': reference}
+
+     return response, {'Reference': reference}
177
+ def flush():
178
+ return None
179
+
180
+ examples = [
181
+ ["Will Russia win the war in Ukraine?"],
182
+
183
+ ]
184
+
185
+ demo = gr.Interface(fn=qa_retrieve, title="cicero-qa-api",
186
+ inputs=gr.inputs.Textbox(lines=5, label="what would you like to learn about?"),
187
+ outputs=[gr.components.Textbox(lines=3, label="Themes"),
188
+ gr.components.JSON( label="Reference")],examples=examples)
189
+
190
+ demo.queue(concurrency_count = 4)
191
+ demo.launch()
192
+
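A note on the 0.7 / 0.95 thresholds in retrieve_thoughts: with LangChain's default FAISS setup, the score returned by similarity_search_with_score is typically an L2 distance, so lower means a closer match; tier 1 keeps chunks scoring below 0.7 and tier 2 the 0.7-0.95 band. How db_full was actually built is not shown in this commit, so treat that reading as an assumption. A minimal sketch for inspecting the raw scores locally, assuming the db_full/ files have been fetched via Git LFS and the packages in requirements.txt are installed (HuggingFaceEmbeddings downloads its default sentence-transformers model on first use):

import pandas as pd
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Load the same index app.py uses and look at the raw retrieval scores.
db = FAISS.load_local("db_full", HuggingFaceEmbeddings())
hits = db.similarity_search_with_score("Will Russia win the war in Ukraine?", k=20)

scores = pd.DataFrame(
    [{"title": doc.metadata.get("title"), "score": score} for doc, score in hits]
)
# Assumption: lower score = closer match; app.py treats < 0.7 as tier 1 and 0.7-0.95 as tier 2.
print(scores.sort_values("score").head(10))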
db_full/index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9051c0122a839f58dc047ba2145fd887b64a33ecd746bd17aec950ca044f0653
+ size 354250797
db_full/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3ccf07d4b39015b8e101152d341883a99d311f22ab7dfc5edba88277041b9179
+ size 102244751
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ pandas
+ langchain
+ openai
+ FAISS-gpu
+ tiktoken
+ transformers
+ sentence_transformers
+ bson
+ anthropic==0.2.10
utils.py ADDED
@@ -0,0 +1,49 @@
+ from langchain.llms.base import LLM
+ from typing import Optional, List, Mapping, Any
+ import anthropic
+ from urllib.parse import urlparse
+ import os
+ class ClaudeLLM(LLM):
+
+     @property
+     def _llm_type(self) -> str:
+
+         return "custom"
+
+     def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+
+
+         client = anthropic.Client(os.environ['ANTHROPIC_KEY'])
+
+
+         # Wrap the prompt in Anthropic's human/assistant turn markers
+         prompt_formatted = (
+             f"{anthropic.HUMAN_PROMPT}{prompt}\n{anthropic.AI_PROMPT}"
+         )
+
+
+         response = client.completion(
+             prompt=prompt_formatted,
+             stop_sequences=[anthropic.HUMAN_PROMPT],
+             model="claude-instant-v1-100k",
+             max_tokens_to_sample=100000,
+             temperature=0.3,
+         )
+
+         return response["completion"]
+
+     @property
+     def _identifying_params(self) -> Mapping[str, Any]:
+         """Get the identifying parameters."""
+         return {
+
+         }
+
+ def remove_numbers(question):
+     return question.translate(str.maketrans('', '', '0123456789'))
+
+ def extract_website_name(url):
+     parsed_url = urlparse(url)
+     if parsed_url.netloc.startswith("www."):
+         return parsed_url.netloc.split("www.")[1].split(".")[0]
+     return parsed_url.netloc.split(".")[0]
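For quick local testing of these helpers, a minimal sketch. Assumptions: anthropic==0.2.10 and langchain are installed, ANTHROPIC_KEY is set to a valid key (the value below is a placeholder), the claude-instant-v1-100k model is still served by the API, and the prompt text is just an example:

import os
from utils import ClaudeLLM, extract_website_name, remove_numbers

os.environ.setdefault("ANTHROPIC_KEY", "your-key-here")  # placeholder; export a real key instead

# LangChain LLMs are callable: this routes through ClaudeLLM._call above.
llm = ClaudeLLM()
print(llm("Give one general discussion question about media coverage of the war in Ukraine."))

# The two string helpers need no API access:
print(remove_numbers("1. First theme"))                         # -> ". First theme"
print(extract_website_name("https://www.example.com/article"))  # -> "example"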