Spaces:
Sleeping
Sleeping
Islam YAHIAOUI
committed on
Commit
•
1e4288a
1
Parent(s):
59e1c4f
Update space
Browse files- Helpers.py +173 -0
- Processing.py +42 -0
- QdrantU.py +61 -0
- __pycache__/Helpers.cpython-312.pyc +0 -0
- __pycache__/Processing.cpython-312.pyc +0 -0
- __pycache__/QdrantU.cpython-312.pyc +0 -0
- __pycache__/app.cpython-312.pyc +0 -0
- __pycache__/rag.cpython-312.pyc +0 -0
- rag.py +30 -0
Helpers.py
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import json
|
3 |
+
import spacy
|
4 |
+
import string
|
5 |
+
|
6 |
+
def generate_prompt(context, question, history):
    """Build a Llama-2 chat prompt asking ``question`` against ``context``.

    Args:
        context: Retrieved passages, either a single string or an iterable of
            passages. An iterable is joined with blank lines so the model
            sees plain text rather than a Python list repr. Falsy values
            (``None``, ``""``, ``[]``) yield "No context provided.".
        question: The user question placed in the instruction turn.
        history: Previous conversation turns. Accepted so existing callers
            keep working; it is not included in the prompt.

    Returns:
        The prompt string.
    """
    if not context:
        prompt_context = "No context provided."
    elif isinstance(context, str):
        prompt_context = context
    else:
        prompt_context = "\n\n".join(str(passage) for passage in context)

    system_prompt = (
        "You are a helpful, respectful and honest assistant. Always answer as "
        "helpfully as possible based on the context, while being safe. Your "
        "answers should not include any harmful, unethical, racist, sexist, "
        "toxic, dangerous, or illegal content."
    )

    # A single [INST] ... [/INST] pair wraps the system block and the user
    # turn; opening [INST] a second time leaves the tags unbalanced.
    return (
        f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
        f"Context:\n{prompt_context}\n\n"
        f"{question} [/INST]\n\n"
        "Response:\n"
    )
|
28 |
+
|
29 |
+
# ==============================================================================================================================================
|
30 |
+
def llama(prompt):
    """Generate a completion for ``prompt`` with Llama-2 13B chat via Eden AI.

    The API token is read from the ``EDENAI_API_KEY`` environment variable so
    that no credential is stored in the source tree.

    Args:
        prompt: Full prompt text, sent as the ``text`` field of the request.

    Returns:
        The ``generated_text`` field of the provider's result.

    Raises:
        RuntimeError: If ``EDENAI_API_KEY`` is not set, or the response does
            not contain the expected provider result.
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    import os  # local import keeps this block self-contained

    api_key = os.environ.get("EDENAI_API_KEY")
    if not api_key:
        raise RuntimeError("EDENAI_API_KEY environment variable is not set")

    provider = "meta/llama2-13b-chat-v1"
    url = "https://api.edenai.run/v2/text/generation"
    payload = {
        "providers": provider,
        "response_as_dict": True,
        "attributes_as_list": False,
        "show_original_response": False,
        "temperature": 0,
        "max_tokens": 256,
        "text": prompt,
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": f"Bearer {api_key}",
    }

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.post(url, json=payload, headers=headers, timeout=60)
    response.raise_for_status()
    result = response.json()
    try:
        return result[provider]["generated_text"]
    except (KeyError, TypeError) as exc:
        raise RuntimeError(f"Unexpected Eden AI response: {result!r}") from exc
|
50 |
+
# ==============================================================================================================================================
|
51 |
+
|
52 |
+
def question_answering(question, texts=None):
    """Send a question-answering request to the Eden AI API.

    The API token is read from the ``EDENAI_API_KEY`` environment variable.

    Args:
        question: The question to be answered.
        texts: Optional list of passages the answer should be drawn from.
            When omitted, the two sample passages about Linux are used.

    Returns:
        The ``answers`` field of the ``openai`` provider result, or ``None``
        if it is empty or any error occurs (the error is printed).
    """
    import os  # local import keeps this block self-contained

    api_key = os.environ.get("EDENAI_API_KEY")
    if not api_key:
        print("Error communicating with LLM model: EDENAI_API_KEY is not set")
        return None

    if texts is None:
        texts = [
            "Linux is a family of open-source Unix-like operating systems based on the Linux kernel, an operating system kernel first released on September 17, 1991, by Linus Torvalds.",
            "Just like Windows, iOS, and Mac OS, Linux is an operating system. ",
        ]

    url = "https://api.edenai.run/v2/text/question_answer"
    headers = {"Authorization": f"Bearer {api_key}"}
    payload = {
        "providers": "openai",
        "texts": texts,
        "question": question,
        "examples": [["What is human life expectancy in the United States?", "78 years."]],
        "fallback_providers": "",
    }

    try:
        response = requests.post(url, json=payload, headers=headers, timeout=60)
        response.raise_for_status()
        answers = response.json()["openai"]["answers"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # Best effort by contract: report the failure and return None.
        print(f"Error communicating with LLM model: {e}")
        return None

    return answers if answers else None
|
84 |
+
|
85 |
+
# ==============================================================================================================================================
|
86 |
+
_SPACY_MODELS = {}


def _get_spacy_model(name="en_core_web_sm"):
    """Return the spaCy pipeline ``name``, loading it only on first use."""
    if name not in _SPACY_MODELS:
        _SPACY_MODELS[name] = spacy.load(name)
    return _SPACY_MODELS[name]


def normalize_text(s):
    """Normalize ``s`` for text comparison.

    Steps, in order: lowercase, strip ASCII punctuation, drop words the
    spaCy vocabulary marks as stop words, lemmatize, and collapse runs of
    whitespace into single spaces.

    The spaCy pipeline is loaded once and reused; loading it on every call
    is by far the most expensive part of this function.

    Args:
        s: Text to normalize.

    Returns:
        The normalized string.
    """
    nlp = _get_spacy_model()

    text = s.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = " ".join(word for word in text.split() if not nlp.vocab[word].is_stop)
    text = " ".join(token.lemma_ for token in nlp(text))
    # split()/join trims both ends and condenses inner whitespace.
    return " ".join(text.split())
|
107 |
+
|
108 |
+
# ==============================================================================================================================================
|
109 |
+
def get_relevance_docs(documents_score, threshold):
    """Flag each retrieved document as relevant (1) or not relevant (0).

    Parameters:
        documents_score (iterable): Scores of the retrieved documents.
        threshold (float): Minimum score (inclusive) for a document to count
            as relevant.

    Returns:
        list: One flag per input score, in the same order: 1 when
        ``score >= threshold``, otherwise 0.
    """
    return [1 if score >= threshold else 0 for score in documents_score]
|
127 |
+
|
128 |
+
# ==============================================================================================================================================
|
129 |
+
def get_docs_by_indices(docs, indices):
    """Pick the documents located at the given positions.

    Args:
    - docs (list): Documents to select from.
    - indices (list): Positions of the wanted documents within ``docs``.

    Returns:
    - list: The selected documents, in the order the indices were given.
    """
    selected = []
    for position in indices:
        selected.append(docs[position])
    return selected
|
141 |
+
|
142 |
+
def query_rewriter(query):
    """Ask the Eden AI API to rewrite ``query`` for better retrieval.

    The API token is read from the ``EDENAI_API_KEY`` environment variable.
    The rewritten text is used only when it is longer than the original
    query; otherwise the original query is returned unchanged.

    Args:
        query: The user's original query.

    Returns:
        The rewritten query (passed through ``split_text``), or ``query``
        itself when the generated text is not longer than it.

    Raises:
        RuntimeError: If ``EDENAI_API_KEY`` is not set.
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    import os  # local import keeps this block self-contained

    api_key = os.environ.get("EDENAI_API_KEY")
    if not api_key:
        raise RuntimeError("EDENAI_API_KEY environment variable is not set")

    url = "https://api.edenai.run/v2/text/code_generation"
    headers = {"Authorization": f"Bearer {api_key}"}
    payload = {
        "providers": "openai",
        "prompt": "",
        "model": "gpt-3.5-turbo",
        "instruction": (
            "You are an expert in document retrieval and search optimization.\n"
            "Your task is to rewrite the following query to enhance its relevance and usefulness for retrieving information\n"
            "from a database or search engine :\n"
            f"Original Query: {query} \n "
        ),
        "temperature": 0.6,
        "max_tokens": 512,
        # NOTE(review): value kept as found; confirm the format the API expects.
        "fallback_providers": " ['openai']",
    }

    response = requests.post(url, json=payload, headers=headers, timeout=60)
    response.raise_for_status()
    generated = response.json()["openai"]["generated_text"]

    # A rewrite that is not longer than the input is treated as unhelpful.
    if len(generated) > len(query):
        return split_text(generated)
    return query
|
171 |
+
# ==============================================================================================================================================
|
172 |
+
def split_text(text):
    """Return the part of ``text`` after its first colon, stripped.

    Used to drop a leading label such as "Rewritten Query:". Only the first
    colon is treated as the separator, so colons inside the remaining text
    (times, URLs, ...) are preserved. Text without a colon is returned
    unchanged.
    """
    _label, separator, remainder = text.partition(":")
    return remainder.strip() if separator else text
|
Processing.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import BertTokenizer, BertModel
|
2 |
+
import torch
|
3 |
+
|
4 |
+
|
5 |
+
class TextEmbedder:
    """Embed text with a BERT encoder and attention-masked mean pooling."""

    def __init__(self, model_name='bert-base-uncased'):
        """Load the tokenizer and encoder weights for ``model_name``.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                tokenizer and the model. Defaults to 'bert-base-uncased'.
        """
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        # Embeddings are only ever computed for inference.
        self.model.eval()

    def _mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, ignoring positions masked out as padding.

        Args:
            model_output: Object exposing ``last_hidden_state``.
            attention_mask: Mask with one fewer dimension than
                ``last_hidden_state``; zero entries are excluded from the
                average.

        Returns:
            Tensor of the masked mean taken over dimension 1.
        """
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp so a fully masked row divides by a tiny value, not by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def _encode(self, texts):
        """Tokenize ``texts``, run the encoder without gradients, and pool."""
        inputs = self.tokenizer(
            texts, padding=True, truncation=True, return_tensors="pt"
        )
        with torch.no_grad():
            model_output = self.model(**inputs)
        return self._mean_pooling(model_output, inputs["attention_mask"])

    def embed_text(self, examples):
        """Embed a batch of examples.

        Args:
            examples: Mapping whose ``"content"`` entry holds the texts.

        Returns:
            ``{"embedding": array}`` with the pooled embeddings as a NumPy
            array.
        """
        return {"embedding": self._encode(examples["content"]).cpu().numpy()}

    def generate_embeddings(self, dataset):
        """Apply ``embed_text`` over ``dataset`` via its ``map`` method, in batches of 128."""
        return dataset.map(self.embed_text, batched=True, batch_size=128)

    def embed_query(self, query_text):
        """Embed ``query_text`` and return the pooled embedding as a tensor."""
        return self._encode(query_text)
|
QdrantU.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import uuid
|
2 |
+
from qdrant_client.http import models
|
3 |
+
from qdrant_client import QdrantClient
|
4 |
+
|
5 |
+
|
6 |
+
class QdrantU:
    """Small wrapper around a Qdrant collection for uploading and searching."""

    def __init__(self, collection_name, url=None, api_key=None):
        """Connect to Qdrant.

        Args:
            collection_name: Name of the collection all operations target.
            url: Qdrant endpoint. Defaults to the ``QDRANT_URL`` environment
                variable.
            api_key: Qdrant API key. Defaults to the ``QDRANT_API_KEY``
                environment variable.

        Raises:
            RuntimeError: If the URL or the API key is not provided by either
                the arguments or the environment.
        """
        import os  # local import keeps this block self-contained

        url = url or os.environ.get("QDRANT_URL")
        api_key = api_key or os.environ.get("QDRANT_API_KEY")
        if not url or not api_key:
            raise RuntimeError(
                "Qdrant credentials missing: set QDRANT_URL and QDRANT_API_KEY "
                "or pass url= and api_key="
            )

        self.client = QdrantClient(url=url, api_key=api_key)
        self.collection_name = collection_name

    def _upload_documents_to_Qdrant(self, data, source):
        """Upsert one batch of documents.

        Args:
            data: Batch indexable by the column names ``"title"``,
                ``"content"``, ``"publishdate"`` and ``"embedding"``.
            source: Value stored in every point's ``source`` payload field.
        """
        points = []
        for title, content, publishdate, embedding in zip(
            data["title"], data["content"], data["publishdate"], data["embedding"]
        ):
            point = models.PointStruct(
                id=str(uuid.uuid4()),  # fresh UUID for each document
                vector=embedding,
                payload={
                    "title": title,
                    "content": content,
                    "publishdate": publishdate,
                    "source": source,
                },
            )
            points.append(point)

        self.client.upsert(
            collection_name=self.collection_name,
            points=points,
        )

        print("Uploaded:", len(data["embedding"]), "documents to the Qdrant database")

    def upload_to_Qdrant(self, data, batch_size=35, source=''):
        """Upload ``data`` in slices of ``batch_size`` rows.

        Args:
            data: Sliceable collection of documents; every slice must be
                indexable by column name.
            batch_size: Number of rows per upsert; must be at least 1.
            source: Value stored in every point's ``source`` payload field.

        Raises:
            ValueError: If ``batch_size`` is less than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        uploaded = 0
        for start in range(0, len(data), batch_size):
            batch = data[start:start + batch_size]
            self._upload_documents_to_Qdrant(batch, source)
            # Count rows through a column: len(batch) is the number of
            # columns when the slice is a dict of columns.
            uploaded += len(batch["embedding"])
            print(f"Uploaded {uploaded} documents")

    def get_number_of_vectors(self):
        """Return the collection's ``points_count``."""
        collection_info = self.client.get_collection(self.collection_name)
        return collection_info.points_count

    def close_connection(self):
        """Close the underlying Qdrant client."""
        self.client.close()

    def search(self, query, text_embedder, limit):
        """Search the collection for points closest to ``query``.

        Args:
            query: Query text.
            text_embedder: Object whose ``embed_query(query_text=...)``
                returns a batch of embeddings; the first row is used.
            limit: Maximum number of results.

        Returns:
            The client's search result, with payloads included.
        """
        query_vector = text_embedder.embed_query(query_text=query)
        return self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector[0].tolist(),  # first row as a plain list
            limit=limit,
            with_payload=True,
        )
|
__pycache__/Helpers.cpython-312.pyc
ADDED
Binary file (7.82 kB). View file
|
|
__pycache__/Processing.cpython-312.pyc
ADDED
Binary file (3.02 kB). View file
|
|
__pycache__/QdrantU.cpython-312.pyc
ADDED
Binary file (3.37 kB). View file
|
|
__pycache__/app.cpython-312.pyc
ADDED
Binary file (1.99 kB). View file
|
|
__pycache__/rag.cpython-312.pyc
ADDED
Binary file (1.72 kB). View file
|
|
rag.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from QdrantU import QdrantU
|
2 |
+
from Processing import TextEmbedder
|
3 |
+
import cohere
|
4 |
+
from Helpers import generate_prompt, llama, get_docs_by_indices , query_rewriter
|
5 |
+
|
6 |
+
|
7 |
+
def run_rag(query, history=None):
    """Run retrieval and reranking for ``query`` and build the LLM prompt.

    Steps: rewrite the query (best effort), search the 'News_source' Qdrant
    collection, de-duplicate the hits, rerank them with Cohere, and build a
    prompt from the top two documents. The LLM call itself is not made here.

    The Cohere API key is read from the ``COHERE_API_KEY`` environment
    variable.

    Args:
        query: The user's question.
        history: Conversation history, forwarded to ``generate_prompt``.

    Returns:
        The generated prompt string.

    Raises:
        RuntimeError: If ``COHERE_API_KEY`` is not set.
    """
    import os  # local import keeps this block self-contained

    cohere_api_key = os.environ.get("COHERE_API_KEY")
    if not cohere_api_key:
        # Fail before loading the embedding model or opening a connection.
        raise RuntimeError("COHERE_API_KEY environment variable is not set")

    embedding_model = TextEmbedder()
    uploader = QdrantU(collection_name='News_source')
    try:
        try:
            query = query_rewriter(query)
            print("Query after rewriting: ", query)
        except Exception as exc:
            # Rewriting is optional: continue with the user's own query.
            print("Error in query rewriting:", exc)
        search_results = uploader.search(query, embedding_model, limit=1000)
    finally:
        uploader.close_connection()

    # dict.fromkeys removes duplicates while keeping the retrieval order.
    docs = list(dict.fromkeys(result.payload['content'] for result in search_results))

    if docs:
        co = cohere.Client(cohere_api_key)
        rerank_docs = co.rerank(
            query=query,
            documents=docs,
            top_n=min(2, len(docs)),
            model="rerank-english-v3.0",
        )
        indices = [result.index for result in rerank_docs.results]
        documents = get_docs_by_indices(docs, indices)
    else:
        # Nothing retrieved: skip reranking.
        documents = []

    prompt = generate_prompt(documents, query, history)
    print("Prompt: ", prompt)
    return prompt
|