ZillionParts-PDF2Doc

Running

App Files Files Community

E-slam committed on Sep 20, 2024

Commit

ee3f9f8

verified ·

1 Parent(s): 1c3aea0

Upload Allam_Backend_HF.py

Browse files

Files changed (1) hide show

Allam_Backend_HF.py +267 -0

Allam_Backend_HF.py ADDED Viewed

	@@ -0,0 +1,267 @@

+import pandas as pd
+import faiss
+import numpy as np
+import torch
+import requests
+import os
+#import huggingface_hub
+# Hugging Face access token, read from the "hf_token" environment variable.
+# os.getenv returns None when the variable is not set.
+hf_token = os.getenv("hf_token")
+#huggingface_hub.login(hf_token)
+# Article table loaded from the Excel workbook; its 'Article_text' column
+# holds the text that is searched and chunked further down.
+df = pd.read_excel("Allam_SA_Articles.xlsx")
+# Plain Python list of every article text, in DataFrame row order.
+input_texts = df['Article_text'].tolist()
+# Precomputed embedding matrix loaded from disk.
+# NOTE(review): presumably one row per article in `df`, in the same order - confirm.
+MOJ_embeddings = np.load('Allam_embeddings.npy')
+def embed_single_text(query, timeout=30):
+    """Fetch the embedding of ``query`` from the hosted E5 embedding service.
+
+    The query is passed through ``params`` so that ``requests`` URL-encodes
+    it; interpolating it into the URL by hand corrupts queries that contain
+    characters such as ``&``, ``#`` or ``+``.
+
+    Args:
+        query: Text to embed.
+        timeout: Seconds to wait for the service before ``requests`` raises
+            a timeout error instead of blocking forever.
+
+    Returns:
+        A ``torch.Tensor`` built from the JSON payload, or ``None`` when the
+        service answers with a non-200 status (the status is printed).
+    """
+    headers = {
+        "Authorization": f"Bearer {hf_token}"
+    }
+    url = "https://allam-llm-e5-embeddings.hf.space/e5_embeddings"
+    response = requests.get(
+        url, headers=headers, params={"query": query}, timeout=timeout
+    )
+    if response.status_code == 200:
+        return torch.tensor(response.json())
+    print(f"Error: {response.status_code}")
+    return None
+# Faiss: build a flat inner-product index over the precomputed embeddings.
+# The vector dimensionality is the size of the matrix's second axis.
+dimension = MOJ_embeddings.shape[1]
+index = faiss.IndexFlatIP(dimension)
+# NOTE(review): faiss expects float32 input - confirm the dtype stored in the .npy file.
+# NOTE(review): inner product ranks like cosine similarity only if the vectors
+# are L2-normalised - confirm how the embeddings were produced.
+index.add(MOJ_embeddings)
+def query_search(query, K):
+    """Return the row positions of the ``K`` indexed vectors nearest to ``query``.
+
+    Args:
+        query: Text to embed and search for.
+        K: Number of neighbours to request from the faiss index.
+
+    Returns:
+        A list of integer positions (usable with ``df.iloc``), best match
+        first. It can be shorter than ``K`` when the index holds fewer
+        vectors.
+
+    Raises:
+        RuntimeError: If ``embed_single_text`` returned ``None``, i.e. the
+            embedding service reported an error.
+    """
+    query_embedding = embed_single_text(query)
+    if query_embedding is None:
+        raise RuntimeError("Could not embed the query: the embedding service returned an error")
+    # faiss searches over a 2-D, C-contiguous float32 array; coerce whatever
+    # the embedding service produced (tensor, 1-D or 2-D) into that layout.
+    query_vectors = np.ascontiguousarray(
+        np.atleast_2d(np.asarray(query_embedding, dtype=np.float32))
+    )
+    _, indices = index.search(query_vectors, K)
+    # faiss pads the result with -1 when fewer than K neighbours exist;
+    # keeping it would silently select the last row through df.iloc[-1].
+    return [int(idx) for idx in indices[0] if idx >= 0]
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+def _split_into_word_chunks(text, max_words=150):
+    """Split ``text`` on whitespace into chunks of at most ``max_words`` words."""
+    if not isinstance(text, str):
+        # Empty spreadsheet cells are not strings; they yield no chunks.
+        return []
+    words = text.split()
+    return [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
+
+
+def _rank_chunks_by_tfidf(texts, query, k):
+    """Return the ``k`` ((chunk, idx), similarity) pairs closest to ``query``, best first."""
+    documents = [text for text, _ in texts]
+    # The query is vectorised together with the chunks so they share one vocabulary.
+    tfidf_matrix = TfidfVectorizer().fit_transform(documents + [query])
+    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
+    top_k_indices = similarities.argsort()[-k:][::-1]
+    return [(texts[i], similarities[i]) for i in top_k_indices]
+
+
+def return_top5_chunks(query):
+    """Build the text block of the five chunks most similar to ``query``.
+
+    The 15 nearest articles are fetched with ``query_search``, split into
+    150-word chunks and re-ranked against the query by TF-IDF cosine
+    similarity.
+
+    Args:
+        query: The user question.
+
+    Returns:
+        A string with one ``"Index: <row label>,\\nChunk: <text>\\n"`` entry per
+        selected chunk, entries separated by ``"##########\\n"``. An empty
+        string is returned when no chunk could be produced.
+    """
+    matching_indices = query_search(query, 15)
+    relevant_rows = df.iloc[matching_indices]
+    # Chunk while iterating instead of adding a column to the slice, which
+    # would assign into a view of ``df`` (SettingWithCopyWarning).
+    chunked_texts = [
+        (chunk, idx)
+        for idx, article_text in relevant_rows['Article_text'].items()
+        for chunk in _split_into_word_chunks(article_text)
+    ]
+    if not chunked_texts:
+        # Nothing to rank; cosine_similarity would fail on an empty matrix.
+        return ''
+    top_5_chunks = _rank_chunks_by_tfidf(chunked_texts, query, 5)
+    return "##########\n".join(
+        f"Index: {idx},\nChunk: {chunk}\n" for (chunk, idx), _ in top_5_chunks
+    )
+import requests
+# The IBM Cloud API key is a secret and must not live in source control, so
+# it is read from the IBM_CLOUD_API_KEY environment variable.
+# NOTE(review): the key that used to be hard-coded here has been published
+# and should be revoked and rotated.
+api_key = os.getenv("IBM_CLOUD_API_KEY")
+if not api_key:
+    raise RuntimeError("IBM_CLOUD_API_KEY environment variable is not set")
+# Exchange the API key for an IAM bearer token.
+url = "https://iam.cloud.ibm.com/identity/token"
+headers = {
+    "Content-Type": "application/x-www-form-urlencoded"
+}
+data = {
+    "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
+    "apikey": api_key
+}
+response = requests.post(url, headers=headers, data=data, timeout=30)
+# Surface the HTTP error itself rather than a KeyError on 'access_token' below.
+response.raise_for_status()
+token_info = response.json()
+# NOTE(review): this token is fetched once at import time; IAM tokens are
+# presumably time-limited, so a long-running process may need to refresh it - confirm.
+access_token = token_info['access_token']
+def allam_response(context, query, timeout=60):
+    """Ask the ALLaM model to answer ``query`` using only ``context``.
+
+    Args:
+        context: Text the model is instructed to answer from.
+        query: The user question.
+        timeout: Seconds to wait for the generation endpoint before
+            ``requests`` raises a timeout error instead of blocking forever.
+
+    Returns:
+        The ``generated_text`` of the first result in the service response.
+
+    Raises:
+        RuntimeError: If the endpoint answers with a non-200 status; the
+            message carries the response body.
+    """
+    url = "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29"
+    # The prompt is sent exactly as written below, including its indentation.
+    input_text_base = f"""
+    [Context]: {context}
+    [System]:
+    You are an Arabic frindley chatbot named مستنير.
+    You will be provided with an Arabic context ,
+    Your task is to extract and Answer for the questions only from the context provided
+    elaborate on the answer from the context
+    At the end of your response mention the Article : مادة
+    if no answer is found apologize
+    Question: {query}
+    """
+    body = {
+        "input": input_text_base,
+        "parameters": {
+            "decoding_method": "greedy",
+            "max_new_tokens": 900,
+            "min_new_tokens": 0,
+            "stop_sequences": [],
+            "repetition_penalty": 1
+        },
+        "model_id": "sdaia/allam-1-13b-instruct",
+        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
+    }
+    headers = {
+        "Accept": "application/json",
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {access_token}"
+    }
+    response = requests.post(url, headers=headers, json=body, timeout=timeout)
+    if response.status_code != 200:
+        # RuntimeError is still caught by callers that handle ``Exception``.
+        raise RuntimeError("Non-200 response: " + str(response.text))
+    payload = response.json()
+    return payload['results'][0]['generated_text']
+import json
+import re
+def index_num(text):
+    """Extract the numeric ``"Index"`` value from the model's JSON-like reply.
+
+    Both ``{"Index": "12"}`` and ``{"Index": 12}`` are accepted; the quotes
+    around the number must be balanced.
+
+    Args:
+        text: Raw text returned by the model.
+
+    Returns:
+        The index as an ``int``, or ``None`` when the reply holds no numeric
+        index (for example ``{"Index": "not_found"}``). Previously this case
+        crashed with ``TypeError`` from ``int(None)``.
+    """
+    match = re.search(r'"Index"\s*:\s*("?)(\d+)\1', text)
+    return int(match.group(2)) if match else None
+def get_top_matching_chunk(text, query, max_words=500):
+    """Return the chunk of ``text`` that is most similar to ``query``.
+
+    ``text`` is split on whitespace into chunks of at most ``max_words``
+    words, and the chunks are ranked against the query by TF-IDF cosine
+    similarity.
+
+    Args:
+        text: Article text to search in.
+        query: The user question.
+        max_words: Maximum number of words per chunk.
+
+    Returns:
+        The best-matching chunk (words re-joined with single spaces), or an
+        empty string when ``text`` contains no words.
+    """
+    words = text.split()
+    chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
+    if not chunks:
+        # No words at all: cosine_similarity would fail on an empty matrix.
+        return ''
+    if len(chunks) == 1:
+        # Nothing to rank against, so skip the TF-IDF pass.
+        return chunks[0]
+    # The query is vectorised together with the chunks so they share one vocabulary.
+    tfidf_matrix = TfidfVectorizer().fit_transform(chunks + [query])
+    similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
+    return chunks[similarities.argmax()]
+def reformat_indentation(text, indent_spaces=4):
+    """Re-indent every line of ``text`` with exactly ``indent_spaces`` spaces.
+
+    Each line is stripped of leading and trailing whitespace before the new
+    prefix is applied; the lines are re-joined with ``"\\n"``.
+    """
+    prefix = ' ' * indent_spaces
+    return '\n'.join(prefix + raw_line.strip() for raw_line in text.splitlines())
+def return_index_num(data_text, query, timeout=60):
+    """Ask the ALLaM model which chunk in ``data_text`` answers ``query``.
+
+    Args:
+        data_text: Chunk listing whose entries are separated by
+            ``##########``.
+        query: The user question.
+        timeout: Seconds to wait for the generation endpoint before
+            ``requests`` raises a timeout error instead of blocking forever.
+
+    Returns:
+        The raw ``generated_text`` of the first result. The prompt asks the
+        model for ``{"Index": "<n>"}`` or ``{"Index": "not_found"}``.
+
+    Raises:
+        RuntimeError: If the endpoint answers with a non-200 status; the
+            message carries the response body.
+    """
+    url = "https://eu-de.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29"
+    sys_prompt = """
+    Identify the **first** Index chunk with the answer to a given question.
+    Chunks are seperated by ##########
+    Respond only with **Json** format **do not return any words**:
+    {"Index": "extracted_Index"}
+    Or:
+    {"Index": "not_found"}
+    **No additional text allowed**.
+    """
+    sys_prompt += f"Question : {query}"
+    input_text = f"""
+    [Context]: {data_text.strip()}
+    [System]: {sys_prompt.strip()}
+    """
+    # Strip the source-code indentation from every prompt line before sending.
+    input_text = reformat_indentation(input_text, indent_spaces=0)
+    body = {
+        "input": input_text,
+        "parameters": {
+            "decoding_method": "greedy",
+            # The expected answer is a tiny JSON object, so cap generation at 20 tokens.
+            "max_new_tokens": 20,
+            "repetition_penalty": 1
+        },
+        "model_id": "sdaia/allam-1-13b-instruct",
+        "project_id": "72a4dcd4-e6e9-4cdc-9c7e-1a0ef1483936"
+    }
+    headers = {
+        "Accept": "application/json",
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {access_token}"  # module-level IAM token
+    }
+    response = requests.post(url, headers=headers, json=body, timeout=timeout)
+    if response.status_code != 200:
+        # RuntimeError is still caught by callers that handle ``Exception``.
+        raise RuntimeError("Non-200 response: " + str(response.text))
+    payload = response.json()
+    return payload['results'][0]['generated_text']
+def allam_llm(q):
+    """Answer question ``q`` with the retrieve, locate, then generate pipeline.
+
+    Steps: retrieve the top chunks for ``q``, ask the model which chunk's
+    article holds the answer, take the best-matching 500-word chunk of that
+    article, and generate the final answer from it.
+
+    When the model names no usable article index, the retrieved chunk
+    listing itself is used as context instead of crashing; the generation
+    prompt already tells the model to apologise if no answer is found.
+
+    Args:
+        q: The user question.
+
+    Returns:
+        The generated answer text.
+    """
+    chunks_text = return_top5_chunks(q)
+    targeted_chunk = return_index_num(chunks_text, q)
+    try:
+        index_number = index_num(targeted_chunk)
+    except TypeError:
+        # index_num may raise TypeError when the reply holds no numeric index.
+        index_number = None
+    if index_number is None or index_number not in df.index:
+        # No index, or one that does not exist in the table: fall back to
+        # the retrieved chunks as context.
+        context = chunks_text
+    else:
+        # The "Index" shown to the model is the DataFrame row label.
+        text_to_chunk = df.loc[index_number, 'Article_text']
+        context = get_top_matching_chunk(text_to_chunk, q)
+    return allam_response(context, q)