E-slam committed on
Commit
ee3f9f8
·
verified ·
1 Parent(s): 1c3aea0

Upload Allam_Backend_HF.py

Browse files
Files changed (1) hide show
  1. Allam_Backend_HF.py +267 -0
Allam_Backend_HF.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import faiss
3
+ import numpy as np
4
+ import torch
5
+ import requests
6
+ import os
7
+ #import huggingface_hub
8
# Hugging Face access token, read from the "hf_token" environment variable.
# embed_single_text sends it as a Bearer token; os.getenv yields None when
# the variable is unset.
hf_token = os.getenv("hf_token")
#huggingface_hub.login(hf_token)

# Article table loaded from the Excel workbook; the 'Article_text' column is
# the text that gets chunked and searched below.
df = pd.read_excel("Allam_SA_Articles.xlsx")
input_texts = df['Article_text'].tolist()
# Precomputed embedding matrix that is added to the FAISS index below.
# NOTE(review): FAISS hits are later used as positional row numbers into df,
# so this presumably holds one row per df row, in the same order - confirm.
MOJ_embeddings = np.load('Allam_embeddings.npy')
14
+
15
+
16
def embed_single_text(query):
    """Fetch the embedding of ``query`` from the hosted E5 embedding endpoint.

    Args:
        query: Text to embed; sent as the ``query`` URL parameter.

    Returns:
        A ``torch.Tensor`` built from the JSON response body on HTTP 200,
        otherwise ``None`` (the status code is printed).

    Raises:
        requests.exceptions.RequestException: on connection failure or when
            the endpoint does not answer within the timeout.
    """
    headers = {
        "Authorization": f"Bearer {hf_token}"
    }

    url = "https://allam-llm-e5-embeddings.hf.space/e5_embeddings"

    # Let requests build the query string: characters such as '&', '#' and
    # '+' inside the user's text are percent-encoded instead of silently
    # truncating or corrupting the request.
    response = requests.get(
        url,
        headers=headers,
        params={"query": query},
        timeout=30,  # without a timeout a stalled endpoint blocks forever
    )

    if response.status_code == 200:
        return torch.tensor(response.json())

    print(f"Error: {response.status_code}")
    return None
30
+
31
+
32
# FAISS index over the precomputed article embeddings.
# The vector size is taken from the second axis of MOJ_embeddings.
# NOTE(review): assumes MOJ_embeddings is a 2-D float32 array - confirm.
dimension = MOJ_embeddings.shape[1]
# IndexFlatIP ranks by inner product.
# NOTE(review): that equals cosine similarity only if the stored vectors and
# the query vectors are L2-normalised - not verified here.
index = faiss.IndexFlatIP(dimension)
index.add(MOJ_embeddings)
36
+
37
def query_search(query, K):
    """Return the positions of the ``K`` articles closest to ``query``.

    Args:
        query: Free-text search query.
        K: Number of nearest neighbours to request from the FAISS index.

    Returns:
        A list of integer row positions (usable with ``df.iloc``), best
        match first. May be shorter than ``K`` when the index holds fewer
        than ``K`` vectors.

    Raises:
        RuntimeError: if the embedding service returned no embedding.
    """
    query_embedding = embed_single_text(query)
    if query_embedding is None:
        # embed_single_text signals an HTTP error with None; fail here with
        # a clear message instead of deep inside FAISS.
        raise RuntimeError("Could not embed the query: embedding service returned an error")

    # FAISS expects a C-contiguous float32 matrix of shape (n_queries, dim).
    query_vector = np.asarray(query_embedding, dtype=np.float32)
    if query_vector.ndim == 1:
        query_vector = query_vector.reshape(1, -1)
    query_vector = np.ascontiguousarray(query_vector)

    _, indices = index.search(query_vector, K)

    # FAISS pads missing neighbours with -1; used as a positional index that
    # would silently select the last row of df, so drop those entries.
    return [int(idx) for idx in indices[0] if idx >= 0]
48
+
49
+ from sklearn.feature_extraction.text import TfidfVectorizer
50
+ from sklearn.metrics.pairwise import cosine_similarity
51
+
52
def return_top5_chunks(query):
    """Build the candidate context: the 5 chunks most similar to ``query``.

    The 15 nearest articles (by embedding search) are split into chunks of
    at most 150 words, the chunks are ranked against the query with TF-IDF
    cosine similarity, and the best 5 are rendered as text.

    Args:
        query: Free-text search query.

    Returns:
        A string of ``"Index: <row label>,\\nChunk: <text>\\n"`` entries
        separated by ``"##########\\n"``, best match first. Empty string
        when no chunk could be produced.
    """
    matching_indices = query_search(query, 15)
    relevant_rows = df.iloc[matching_indices]

    def chunk_text(text, max_words=150):
        # Split on whitespace and regroup into runs of at most max_words.
        words = text.split()
        return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

    # Pair every chunk with the label of the row it came from. Iterating the
    # column directly avoids assigning a new column onto a slice of df
    # (pandas SettingWithCopyWarning). Non-string cells (e.g. NaN from empty
    # Excel cells) are skipped because they cannot be chunked.
    chunked_texts = [
        (chunk, idx)
        for idx, text in relevant_rows['Article_text'].items()
        if isinstance(text, str)
        for chunk in chunk_text(text)
    ]
    if not chunked_texts:
        # Nothing to rank; TF-IDF would fail on an empty document set.
        return ''

    def find_top_k_similar(texts, query, k):
        # Rank the (chunk, label) pairs by TF-IDF cosine similarity to the
        # query; the query is fitted together with the chunks so both share
        # one vocabulary, then compared as the last row of the matrix.
        documents = [text for text, _ in texts]
        tfidf_matrix = TfidfVectorizer().fit_transform(documents + [query])
        similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
        top_k_indices = similarities.argsort()[-k:][::-1]
        return [(texts[i], similarities[i]) for i in top_k_indices]

    top_5_chunks = find_top_k_similar(chunked_texts, query, 5)

    return "##########\n".join(
        f"Index: {idx},\nChunk: {chunk}\n" for (chunk, idx), _ in top_5_chunks
    )
91
+
92
+
93
+ import requests
94
+
95
+
96
# IBM Cloud API key used to obtain an IAM access token.
# SECURITY: the literal fallback below is committed to the repository and
# must be treated as compromised - rotate it, supply the new key through the
# IBM_API_KEY environment variable, and then delete the fallback.
api_key = os.getenv("IBM_API_KEY", 'UEGtyhQpPCKfhsQ_rPlBbEsgZErSh8xPU57qm9DQ-ZkC')

url = "https://iam.cloud.ibm.com/identity/token"

headers = {
    "Content-Type": "application/x-www-form-urlencoded"
}

data = {
    "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
    "apikey": api_key
}

# Exchange the API key for a bearer token once, at import time.
# NOTE(review): IAM access tokens are short-lived; a long-running process
# will need to refresh this value - confirm the deployment's lifetime.
response = requests.post(url, headers=headers, data=data, timeout=30)
if response.status_code != 200:
    # Fail with the server's explanation rather than a bare KeyError below.
    raise RuntimeError("IAM token request failed: " + str(response.text))
token_info = response.json()
access_token = token_info['access_token']
112
+
113
+
114
+
115
+
116
def allam_response(context, query):
    """Ask the ALLaM model to answer ``query`` using only ``context``.

    Args:
        context: Arabic source text the answer must be drawn from.
        query: The user's question.

    Returns:
        The ``generated_text`` of the first result returned by the
        text-generation endpoint.

    Raises:
        RuntimeError: if the endpoint answers with a non-200 status.
        requests.exceptions.RequestException: on connection failure or when
            the endpoint does not answer within the timeout.
    """
    url = "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29"

    input_text_base = f"""
    [Context]: {context}
    [System]:
    You are an Arabic frindley chatbot named مستنير.
    You will be provided with an Arabic context ,
    Your task is to extract and Answer for the questions only from the context provided
    elaborate on the answer from the context
    At the end of your response mention the Article : مادة
    if no answer is found apologize

    Question: {query}
    """
    body = {
        "input": input_text_base,
        "parameters": {
            "decoding_method": "greedy",
            "max_new_tokens": 900,
            "min_new_tokens": 0,
            "stop_sequences": [],
            "repetition_penalty": 1
        },
        "model_id": "sdaia/allam-1-13b-instruct",
        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
    }

    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        # access_token is the module-level IAM token fetched at import time.
        "Authorization": f"Bearer {access_token}"
    }

    # Generous timeout: up to 900 new tokens are generated, but a stalled
    # connection must not block the caller forever.
    response = requests.post(url, headers=headers, json=body, timeout=120)

    if response.status_code != 200:
        raise RuntimeError("Non-200 response: " + str(response.text))

    payload = response.json()

    return payload['results'][0]['generated_text']
158
+
159
+
160
+
161
+ import json
162
+
163
+ import re
164
+
165
def index_num(text):
    """Extract the integer article index from the model's JSON-like reply.

    Both the quoted form ``{"Index": "12"}`` and the unquoted form
    ``{"Index": 12}`` are accepted; surrounding text is ignored.

    Args:
        text: Raw text generated by the index-selection prompt.

    Returns:
        The index as an ``int``.

    Raises:
        ValueError: if ``text`` contains no numeric index, for example when
            the model answered ``{"Index": "not_found"}``.
    """
    # ("?) ... \1 requires the closing quote only when an opening quote was
    # present; the lookahead rejects values such as "12abc" or a dangling
    # quote so only a clean number is accepted.
    match = re.search(r'"Index"\s*:\s*("?)(\d+)\1(?![\w"])', text)
    if match is None:
        raise ValueError(f"No numeric Index found in model output: {text!r}")

    return int(match.group(2))
171
+
172
def get_top_matching_chunk(text, query, max_words=500):
    """Return the chunk of ``text`` most similar to ``query``.

    ``text`` is split on whitespace into chunks of at most ``max_words``
    words; chunks are ranked against the query by TF-IDF cosine similarity.

    Args:
        text: The article text to search.
        query: The user's question.
        max_words: Maximum number of words per chunk.

    Returns:
        The best-matching chunk (whitespace-normalised), or ``''`` when
        ``text`` contains no words.
    """
    words = text.split()
    chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]

    if not chunks:
        # Empty/whitespace-only text: nothing to rank, and TF-IDF would
        # fail on an empty document set.
        return ''
    if len(chunks) == 1:
        # A single candidate is trivially the best one; skip vectorising.
        return chunks[0]

    # Fit chunks and query together so they share one vocabulary; the query
    # is the last row of the matrix.
    tfidf_matrix = TfidfVectorizer().fit_transform(chunks + [query])
    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

    return chunks[similarities.argmax()]
188
+
189
def reformat_indentation(text, indent_spaces=4):
    """Re-indent every line of ``text`` to one uniform left margin.

    Each line is stripped of leading and trailing whitespace and then
    prefixed with ``indent_spaces`` spaces. Lines are re-joined with a
    single newline; a trailing newline in the input is not kept.
    """
    margin = ' ' * indent_spaces
    return '\n'.join(f"{margin}{raw.strip()}" for raw in text.splitlines())
197
+
198
def return_index_num(data_text, query):
    """Ask the ALLaM model which candidate chunk answers ``query``.

    Args:
        data_text: Candidate chunks, each labelled with its index and
            separated by ``##########``.
        query: The user's question.

    Returns:
        The raw ``generated_text`` of the first result; the prompt asks for
        ``{"Index": "<n>"}`` or ``{"Index": "not_found"}``.

    Raises:
        RuntimeError: if the endpoint answers with a non-200 status.
        requests.exceptions.RequestException: on connection failure or when
            the endpoint does not answer within the timeout.
    """
    url = "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29"

    sys_prompt = """
    Identify the **first** Index chunk with the answer to a given question.
    Chunks are seperated by ##########
    Respond only with **Json** format **do not return any words**:

    {"Index": "extracted_Index"}

    Or:

    {"Index": "not_found"}

    **No additional text allowed**.

    """
    sys_prompt += f"Question : {query}"

    input_text = f"""
    [Context]: {data_text.strip()}
    [System]: {sys_prompt.strip()}
    """

    # Strip the source-code indentation from every prompt line before it is
    # sent to the model.
    input_text = reformat_indentation(input_text, indent_spaces=0)
    body = {
        "input": input_text,
        "parameters": {
            "decoding_method": "greedy",
            "max_new_tokens": 20,
            "repetition_penalty": 1
        },
        "model_id": "sdaia/allam-1-13b-instruct",
        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
    }

    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        # access_token is the module-level IAM token fetched at import time.
        "Authorization": f"Bearer {access_token}"
    }

    # A stalled connection must not block the caller forever.
    response = requests.post(url, headers=headers, json=body, timeout=60)

    if response.status_code != 200:
        raise RuntimeError("Non-200 response: " + str(response.text))

    payload = response.json()

    return payload['results'][0]['generated_text']
250
+
251
+
252
+
253
def allam_llm(q):
    """Answer question ``q`` end to end with retrieval plus generation.

    Steps: retrieve the 5 best candidate chunks, let the model pick the
    index of the article that answers the question, take the best-matching
    chunk of that article, and generate the final answer from it.

    Args:
        q: The user's question.

    Returns:
        The model's generated answer text.
    """
    chunks_text = return_top5_chunks(q)

    targeted_chunk = return_index_num(chunks_text, q)

    try:
        index_number = index_num(targeted_chunk)
        text_to_chunk = df['Article_text'][index_number]
    except (TypeError, ValueError, KeyError):
        # The selector may legitimately answer "not_found", reply in an
        # unexpected shape, or name an index that is not in df. Instead of
        # crashing, answer from the retrieved candidates; the answering
        # prompt already tells the model to apologise when the context
        # holds no answer.
        return allam_response(chunks_text, q)

    top_chunk = get_top_matching_chunk(text_to_chunk, q)

    return allam_response(top_chunk, q)