Islam YAHIAOUI committed on
Commit
1e4288a
1 Parent(s): 59e1c4f

Update space

Browse files
Helpers.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ import spacy
4
+ import string
5
+
6
def generate_prompt(context, question, history):
    """Build the Llama-2 style chat prompt for a question and its context.

    Args:
        context: Retrieved context. Either a single string or a list/tuple
            of documents; a sequence is rendered as blank-line separated
            passages instead of as a Python ``repr``. Any falsy value is
            replaced by the text "No context provided.".
        question: The user question placed in the instruction block.
        history: Accepted for interface compatibility; not used when
            building the prompt.

    Returns:
        The formatted prompt string.
    """
    if isinstance(context, (list, tuple)):
        prompt_context = "\n\n".join(str(doc) for doc in context)
    else:
        prompt_context = context
    if not prompt_context:
        prompt_context = "No context provided."

    # One [INST] ... [/INST] pair wraps both the system block and the user
    # turn; the template must not open a second [INST] before the question.
    return f"""
<s>[INST] <<SYS>> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible based on the context, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.<</SYS>>

Context:
{prompt_context}

{question} [/INST]

Response:
"""
28
+
29
+ # ==============================================================================================================================================
30
def llama(prompt):
    """Request a completion for *prompt* from the EdenAI text-generation API.

    The request targets the ``meta/llama2-13b-chat-v1`` provider with
    temperature 0 and a 256-token limit. The bearer token is read from the
    ``EDENAI_API_KEY`` environment variable so that no credential lives in
    the source tree.

    Args:
        prompt: Full prompt text to send.

    Returns:
        The ``generated_text`` entry of the provider's result.

    Raises:
        RuntimeError: If ``EDENAI_API_KEY`` is not set.
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
        KeyError: If the response lacks the expected provider/field.
    """
    import os

    api_key = os.environ.get("EDENAI_API_KEY")
    if not api_key:
        raise RuntimeError("EDENAI_API_KEY environment variable is not set")

    provider = "meta/llama2-13b-chat-v1"
    url = "https://api.edenai.run/v2/text/generation"
    payload = {
        "providers": provider,
        "response_as_dict": True,
        "attributes_as_list": False,
        "show_original_response": False,
        "temperature": 0,
        "max_tokens": 256,
        "text": prompt,
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": f"Bearer {api_key}",
    }

    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.post(url, json=payload, headers=headers, timeout=60)
    response.raise_for_status()
    result = response.json()
    return result[provider]['generated_text']
50
+ # ==============================================================================================================================================
51
+
52
def question_answering(question, texts=None):
    """Send a question-answering request to the EdenAI API.

    The bearer token is read from the ``EDENAI_API_KEY`` environment
    variable.

    Args:
        question: The question to be answered.
        texts: Optional list of passages the answer should be drawn from.
            Defaults to the two built-in sample passages about Linux.

    Returns:
        The ``answers`` value returned for the ``openai`` provider, or
        ``None`` if it is empty, the API key is missing, or the request
        fails.
    """
    import os

    api_key = os.environ.get("EDENAI_API_KEY")
    if not api_key:
        print("Error communicating with LLM model: EDENAI_API_KEY is not set")
        return None

    if texts is None:
        texts = [
            "Linux is a family of open-source Unix-like operating systems based on the Linux kernel, an operating system kernel first released on September 17, 1991, by Linus Torvalds.",
            "Just like Windows, iOS, and Mac OS, Linux is an operating system. ",
        ]

    headers = {"Authorization": f"Bearer {api_key}"}
    url = "https://api.edenai.run/v2/text/question_answer"
    payload = {
        "providers": "openai",
        "texts": texts,
        'question': question,
        "examples": [["What is human life expectancy in the United States?", "78 years."]],
        "fallback_providers": "",
    }

    try:
        response = requests.post(url, json=payload, headers=headers, timeout=60)
        response.raise_for_status()
        result = response.json()
        return result['openai']['answers'] or None
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # Best-effort contract: report the failure and signal it with None.
        print(f"Error communicating with LLM model: {e}")
        return None
84
+
85
+ # ==============================================================================================================================================
86
_SPACY_MODEL = None


def _get_spacy_model():
    """Load the ``en_core_web_sm`` pipeline once and reuse it afterwards."""
    global _SPACY_MODEL
    if _SPACY_MODEL is None:
        _SPACY_MODEL = spacy.load("en_core_web_sm")
    return _SPACY_MODEL


def normalize_text(s):
    """Normalize *s* for text comparison.

    Steps, in order: lowercase, strip ASCII punctuation, drop words that the
    spaCy vocabulary flags as stop words, lemmatize, and collapse all
    whitespace runs into single spaces.

    Args:
        s: The text to normalize.

    Returns:
        The normalized string.
    """
    # Loading the spaCy pipeline is expensive, so it is cached across calls
    # instead of being reloaded every time.
    nlp = _get_spacy_model()

    lowered = s.lower()
    # One translate pass removes every character in string.punctuation.
    without_punct = lowered.translate(str.maketrans("", "", string.punctuation))
    without_stop = " ".join(
        word for word in without_punct.split() if not nlp.vocab[word].is_stop
    )
    lemmatized = " ".join(token.lemma_ for token in nlp(without_stop))
    # split()/join trims both ends and condenses internal whitespace.
    return " ".join(lemmatized.split())
107
+
108
+ # ==============================================================================================================================================
109
def get_relevance_docs(documents_score, threshold):
    """Convert document scores into binary relevance labels.

    Args:
        documents_score: Iterable of scores for the retrieved documents.
        threshold: Minimum score (inclusive) for a document to count as
            relevant.

    Returns:
        A list with one entry per input score: ``1`` when the score is
        greater than or equal to *threshold*, otherwise ``0``.
    """
    return [1 if score >= threshold else 0 for score in documents_score]
127
+
128
+ # ==============================================================================================================================================
129
def get_docs_by_indices(docs, indices):
    """Pick documents out of *docs* in the order given by *indices*.

    Args:
        docs: Indexable collection of documents.
        indices: Iterable of positions to look up in *docs*.

    Returns:
        A list holding ``docs[i]`` for every ``i`` in *indices*, in the
        same order as *indices*.
    """
    selected = []
    for position in indices:
        selected.append(docs[position])
    return selected
141
+
142
def query_rewriter(query):
    """Ask the EdenAI API to rewrite *query* for better retrieval.

    The bearer token is read from the ``EDENAI_API_KEY`` environment
    variable. The rewritten text is used only when it is longer than the
    original query; otherwise the original query is returned unchanged.

    Args:
        query: The user's original search query.

    Returns:
        The rewritten query with any leading "label:" prefix removed by
        ``split_text``, or *query* itself when the model's output is not
        longer than the input.

    Raises:
        RuntimeError: If ``EDENAI_API_KEY`` is not set.
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
        KeyError: If the response lacks the expected provider/field.
    """
    import os

    api_key = os.environ.get("EDENAI_API_KEY")
    if not api_key:
        raise RuntimeError("EDENAI_API_KEY environment variable is not set")

    headers = {"Authorization": f"Bearer {api_key}"}
    url = "https://api.edenai.run/v2/text/code_generation"
    payload = {
        "providers": "openai",
        "prompt": "",
        "model": "gpt-3.5-turbo",
        "instruction": f"""You are an expert in document retrieval and search optimization.
Your task is to rewrite the following query to enhance its relevance and usefulness for retrieving information
from a database or search engine :
Original Query: {query} \n """,
        "temperature": 0.6,
        "max_tokens": 512,
        # NOTE(review): this value is a string holding a list repr and names
        # the primary provider as its own fallback - confirm against the API.
        "fallback_providers": " ['openai']",
    }

    response = requests.post(url, json=payload, headers=headers, timeout=60)
    response.raise_for_status()
    rewritten = response.json()['openai']['generated_text']

    # A reply no longer than the input is treated as a failed rewrite.
    if len(rewritten) <= len(query):
        return query
    return split_text(rewritten)
171
+ # ==============================================================================================================================================
172
def split_text(text):
    """Return the part of *text* that follows its first colon.

    Only the first colon is treated as a separator, so colons inside the
    remaining text (times, URLs, sub-clauses) are preserved instead of
    truncating the result.

    Args:
        text: The string to split.

    Returns:
        Everything after the first ``":"`` with surrounding whitespace
        stripped, or *text* unchanged when it contains no colon.
    """
    _, separator, remainder = text.partition(":")
    return remainder.strip() if separator else text
Processing.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BertTokenizer, BertModel
2
+ import torch
3
+
4
+
5
class TextEmbedder:
    """Embed text with ``bert-base-uncased`` using attention-masked mean pooling."""

    def __init__(self):
        """Load the BERT tokenizer and model."""
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained('bert-base-uncased')

    def _mean_pooling(self, model_output, attention_mask):
        """Average the last hidden states over the positions kept by the mask.

        Args:
            model_output: Object exposing ``last_hidden_state``.
            attention_mask: Mask tensor; it is expanded to the hidden-state
                shape and cast to float before weighting.

        Returns:
            The masked sum along dimension 1 divided by the mask sum along
            dimension 1 (clamped to at least 1e-9 to avoid division by zero).
        """
        hidden_states = model_output.last_hidden_state
        mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        weighted_total = (hidden_states * mask).sum(1)
        mask_total = mask.sum(1).clamp(min=1e-9)
        return weighted_total / mask_total

    def _encode(self, texts):
        """Tokenize *texts*, run the model without gradients, and pool."""
        encoded = self.tokenizer(
            texts, padding=True, truncation=True, return_tensors="pt"
        )
        with torch.no_grad():
            output = self.model(**encoded)
        return self._mean_pooling(output, encoded["attention_mask"])

    def embed_text(self, examples):
        """Embed the ``"content"`` entry of *examples*.

        Returns:
            A dict with key ``"embedding"`` holding the pooled embeddings as
            a NumPy array.
        """
        pooled = self._encode(examples["content"])
        return {"embedding": pooled.cpu().numpy()}

    def generate_embeddings(self, dataset):
        """Apply ``embed_text`` to *dataset* via its ``map`` in batches of 128."""
        return dataset.map(self.embed_text, batched=True, batch_size=128)

    def embed_query(self, query_text):
        """Embed *query_text* and return the pooled embedding tensor."""
        return self._encode(query_text)
QdrantU.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uuid
2
+ from qdrant_client.http import models
3
+ from qdrant_client import QdrantClient
4
+
5
+
6
class QdrantU:
    """Small wrapper around a Qdrant client bound to one collection."""

    def __init__(self, collection_name):
        """Connect to Qdrant and remember the target collection.

        The API key is read from the ``QDRANT_API_KEY`` environment variable
        so that no credential lives in the source tree. The cluster URL can
        be overridden with ``QDRANT_URL``.

        Args:
            collection_name: Name of the collection used by every method.

        Raises:
            RuntimeError: If ``QDRANT_API_KEY`` is not set.
        """
        import os

        api_key = os.environ.get("QDRANT_API_KEY")
        if not api_key:
            raise RuntimeError("QDRANT_API_KEY environment variable is not set")

        self.client = QdrantClient(
            url=os.environ.get(
                "QDRANT_URL",
                "https://5c32ac64-b1f7-4665-91eb-e321a98c02f6.europe-west3-0.gcp.cloud.qdrant.io:6333",
            ),
            api_key=api_key,
        )
        self.collection_name = collection_name

    def _upload_documents_to_Qdrant(self, data, source):
        """Upsert one batch of documents.

        Args:
            data: Mapping-like batch with equally long ``"title"``,
                ``"content"``, ``"publishdate"`` and ``"embedding"`` columns.
            source: Value stored in every point's ``"source"`` payload field.

        Returns:
            The number of points built from the batch.
        """
        rows = zip(data["title"], data["content"], data["publishdate"], data["embedding"])
        points = [
            models.PointStruct(
                id=str(uuid.uuid4()),  # a fresh UUID for each document
                vector=embedding,
                payload={
                    "title": title,
                    "content": content,
                    "publishdate": publishdate,
                    "source": source,
                },
            )
            for title, content, publishdate, embedding in rows
        ]

        # Skip the request for an empty batch.
        if points:
            self.client.upsert(
                collection_name=self.collection_name,
                points=points,
            )

        print("Uploaded:", len(points), "documents to the Qdrant database")
        return len(points)

    def upload_to_Qdrant(self, data, batch_size=35, source=''):
        """Upload *data* in consecutive slices of *batch_size* rows.

        Args:
            data: Collection supporting ``len()``, slicing, and column access
                by name - presumably a Hugging Face ``Dataset``; verify
                against the caller.
            batch_size: Number of rows per upsert request.
            source: Value stored in every point's ``"source"`` payload field.
        """
        uploaded = 0
        for start in range(0, len(data), batch_size):
            batch = data[start:start + batch_size]
            # Count actual points: len(batch) is the number of columns, not
            # rows, when a slice comes back as a dict of columns.
            uploaded += self._upload_documents_to_Qdrant(batch, source)
            print(f"Uploaded {uploaded} documents")

    def get_number_of_vectors(self):
        """Return the ``points_count`` reported for the collection."""
        collection_info = self.client.get_collection(self.collection_name)
        return collection_info.points_count

    def close_connection(self):
        """Close the underlying Qdrant client."""
        self.client.close()

    def search(self, query, text_embedder, limit):
        """Search the collection for points similar to *query*.

        Args:
            query: Query text to embed.
            text_embedder: Object whose ``embed_query(query_text=...)``
                returns an indexable result; its first element is converted
                with ``tolist()`` and used as the query vector.
            limit: Maximum number of results to request.

        Returns:
            The search result from the Qdrant client, payloads included.
        """
        query_vector = text_embedder.embed_query(query_text=query)
        return self.client.search(
            collection_name=self.collection_name,
            query_vector=query_vector[0].tolist(),  # convert tensor to list
            limit=limit,
            with_payload=True,
        )
__pycache__/Helpers.cpython-312.pyc ADDED
Binary file (7.82 kB). View file
 
__pycache__/Processing.cpython-312.pyc ADDED
Binary file (3.02 kB). View file
 
__pycache__/QdrantU.cpython-312.pyc ADDED
Binary file (3.37 kB). View file
 
__pycache__/app.cpython-312.pyc ADDED
Binary file (1.99 kB). View file
 
__pycache__/rag.cpython-312.pyc ADDED
Binary file (1.72 kB). View file
 
rag.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from QdrantU import QdrantU
2
+ from Processing import TextEmbedder
3
+ import cohere
4
+ from Helpers import generate_prompt, llama, get_docs_by_indices , query_rewriter
5
+
6
+
7
def run_rag(query, history=None):
    """Run retrieval and reranking for *query* and build the LLM prompt.

    Steps: rewrite the query (best effort), fetch up to 1000 candidates from
    the ``News_source`` Qdrant collection, de-duplicate their contents,
    rerank them with Cohere keeping the top 2, and format the prompt.

    The Cohere key is read from the ``COHERE_API_KEY`` environment variable
    so that no credential lives in the source tree.

    Args:
        query: The user's question.
        history: Conversation history, forwarded to ``generate_prompt``.

    Returns:
        The generated prompt string. The LLM is not called here.

    Raises:
        RuntimeError: If ``COHERE_API_KEY`` is not set.
    """
    import os

    # Fail fast, before loading the embedding model or opening connections.
    cohere_key = os.environ.get("COHERE_API_KEY")
    if not cohere_key:
        raise RuntimeError("COHERE_API_KEY environment variable is not set")

    embedding_model = TextEmbedder()
    uploader = QdrantU(collection_name='News_source')
    try:
        try:
            query = query_rewriter(query)
            print("Query after rewriting: ", query)
        except Exception as err:
            # Rewriting is optional: fall back to the original query, but do
            # not swallow KeyboardInterrupt/SystemExit as a bare except did.
            print("Error in query rewriting:", err)
        search_results = uploader.search(query, embedding_model, limit=1000)
    finally:
        # Release the Qdrant client even when the search fails.
        uploader.close_connection()

    # dict.fromkeys de-duplicates while preserving retrieval order, which
    # keeps the document list deterministic (a set does not).
    docs = list(dict.fromkeys(result.payload['content'] for result in search_results))

    if docs:
        co = cohere.Client(cohere_key)
        rerank_docs = co.rerank(
            query=query, documents=docs, top_n=2, model="rerank-english-v3.0"
        )
        indices = [result.index for result in rerank_docs.results]
        documents = get_docs_by_indices(docs, indices)
    else:
        # Nothing retrieved: skip the rerank call and build the prompt
        # without context.
        documents = []

    prompt = generate_prompt(documents, query, history)
    print("Prompt: ", prompt)
    # NOTE: the call to llama(prompt) is intentionally left out here; the
    # caller receives the prompt itself.
    return prompt