mojtabaa4 committed on
Commit
bc68b0b
1 Parent(s): 3b47706

add application files

app.py ADDED
@@ -0,0 +1,78 @@
+ import os
+ import gradio as gr
+ from model.controller import Controller
+ import zipfile
+
+ os.chdir("/home/user/app")
+
+ os.system('wget -O processed_cases.csv "https://drive.usercontent.google.com/download?id=1jMuQtywo0mbj7ZHCCsyE8xurbSyVVCst&export=download&confirm=t&uuid=2f681c98-86f8-4159-9e03-673cdcbc7cb51"')
+ os.system('wget -O chromadb_collection.zip "https://drive.usercontent.google.com/download?id=1gz5-gxSlySEtPTzL_VPQ9e8jxHFuL0ZJ&export=download&confirm=t&uuid=de946efb-47b3-435d-b432-3bd5c01c73fb"')
+
+ with zipfile.ZipFile("chromadb_collection.zip", 'r') as zip_ref:
+     zip_ref.extractall()
+
+ os.system('mv content/chromadb_collections chromadb_collections')
+ os.system('rm -r content')
+
+ bot = Controller()
+
+ def chatbot_interface(user_input, chat_id=2311):
+     return bot.handle_message(chat_id, user_input)
+
+ def validate_input(user_input):
+     if not user_input or user_input.strip() == "":
+         return False, "🚫 Please enter a valid legal question. It cannot be empty."
+     if len(user_input) < 5:
+         return False, "⚠️ Your question is too short. Please provide more details."
+     return True, None
+
+ custom_css = """
+ @font-face {
+     font-family: 'Vazir';
+     src: url('https://cdn.jsdelivr.net/gh/rastikerdar/vazir-font/vf/Vazir.woff2') format('woff2'),
+          url('https://cdn.jsdelivr.net/gh/rastikerdar/vazir-font/vf/Vazir.woff') format('woff');
+ }
+ .gradio-container {
+     background-color: #f9f9f9;
+ }
+ .chatbox, .inputbox {
+     font-family: 'Vazir', sans-serif;
+     font-size: 16px;
+ }
+ """
+
+ with gr.Blocks(css=custom_css) as interface:
+
+     gr.Markdown("""
+     <div style="text-align: center; font-family: 'Vazir';">
+         <h1 style="color: #4a90e2;">⚖️ RAG Law Chatbot ⚖️</h1>
+         <p style="font-size: 18px; color: #333;">Welcome to the legal chatbot! 👨‍⚖️👩‍⚖️<br>Ask any legal question, and our assistant will help you! 📜🏛️</p>
+     </div>
+     """)
+
+     # Organize the chatbot area in a column for vertical stacking
+     with gr.Column():
+         chatbot = gr.Chatbot(label="🧑‍⚖️ Legal Chatbot Assistant 🧑‍⚖️", height=400, elem_classes=["chatbox"])
+
+     # Use Row to align input and button horizontally
+     with gr.Row():
+         user_input = gr.Textbox(show_label=False, placeholder="Enter your law question here... ⚖️", container=True)
+         send_button = gr.Button("📤 Send")
+
+     # Chat update function to append new messages to the chatbot
+     def chat_update(user_message, history):
+         history = history or []
+
+         is_valid, validation_message = validate_input(user_message)
+         if not is_valid:
+             history.append((user_message, validation_message))
+             return history, ""
+
+         bot_reply = chatbot_interface(user_message)
+         history.append((user_message, bot_reply))
+         return history, ""
+
+     # Connect the button click to the chat update function
+     send_button.click(chat_update, [user_input, chatbot], [chatbot, user_input])
+
+ interface.launch()
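
A quick way to smoke-test the wiring above without the browser UI: gr.Chatbot keeps history as a list of (user, bot) tuples, which is why chat_update appends pairs and returns (history, ""). A minimal sketch, assuming the module's setup (downloads, Controller construction) has completed and its backing services are reachable; the question string is illustrative:

    history, _ = chat_update("What does the law say about tenancy deposits?", [])
    print(history[-1][1])  # bot reply, or a validation message for invalid input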
config.py ADDED
@@ -0,0 +1,34 @@
+ gpt_3_5 = "gpt-3.5-turbo-instruct"
+ gpt_mini = "gpt-4o-mini"
+
+ aval_ai = {
+     "model": gpt_3_5,
+     "base_url": "https://api.avalai.ir/v1",
+ }
+
+ GILAS_CONFIG = {
+     "api_key": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjIwMzg5OTQ0NjgsImp0aSI6IjExNDg4MzAyMTE3NDA0MzY2ODc0NiIsImlhdCI6MTcyMzYzNDQ2OCwiaXNzIjoiaHR0cHM6Ly9naWxhcy5pbyIsIm5iZiI6MTcyMzYzNDQ2OCwic3ViIjoiMTE0ODgzMDIxMTc0MDQzNjY4NzQ2In0.8hbh59BmwBcAfoH9nEB98_5BIuxzwUUb8fpHSKF1S_Q",
+     "model": "gpt-4o-mini",
+     "base_url": "https://api.gilas.io/v1",
+ }
+
+ GILAS_API_KEYS = [
+     "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjIwMzg5OTQ0NjgsImp0aSI6IjExNDg4MzAyMTE3NDA0MzY2ODc0NiIsImlhdCI6MTcyMzYzNDQ2OCwiaXNzIjoiaHR0cHM6Ly9naWxhcy5pbyIsIm5iZiI6MTcyMzYzNDQ2OCwic3ViIjoiMTE0ODgzMDIxMTc0MDQzNjY4NzQ2In0.8hbh59BmwBcAfoH9nEB98_5BIuxzwUUb8fpHSKF1S_Q",
+     "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjIwNDI1MzI3NTYsImp0aSI6IjEwNjg5OTE1MjQwNTM4MzY3Nzc2NyIsImlhdCI6MTcyNzE3Mjc1NiwiaXNzIjoiaHR0cHM6Ly9naWxhcy5pbyIsIm5iZiI6MTcyNzE3Mjc1Niwic3ViIjoiMTA2ODk5MTUyNDA1MzgzNjc3NzY3In0.Jgfi7BWhpXFTYdHe73md5p932EP75wTD-CZQ6SfGkK8",
+     "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjIwNDI1MzMzNzIsImp0aSI6IjEwNjg4MTE2MzAzOTkzMTg2MjY3NiIsImlhdCI6MTcyNzE3MzM3MiwiaXNzIjoiaHR0cHM6Ly9naWxhcy5pbyIsIm5iZiI6MTcyNzE3MzM3Miwic3ViIjoiMTA2ODgxMTYzMDM5OTMxODYyNjc2In0.PhVdoRUdaCfHa4va-EtWP5o7KISCSdMjT5mWtc9cefo",
+     "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjIwNDI1MzM0MDIsImp0aSI6IjExNTY3MDAwOTQyMjcyNTE3NDE1NCIsImlhdCI6MTcyNzE3MzQwMiwiaXNzIjoiaHR0cHM6Ly9naWxhcy5pbyIsIm5iZiI6MTcyNzE3MzQwMiwic3ViIjoiMTE1NjcwMDA5NDIyNzI1MTc0MTU0In0.IRcnkiZJdKNPTE1nYXoeiVMfxj9xXHSvAxBLaBGC6yk",
+     "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjIwNDI1MzM1MzEsImp0aSI6IjExMzk2NzY4OTcxNjg2NjYzNDk3MCIsImlhdCI6MTcyNzE3MzUzMSwiaXNzIjoiaHR0cHM6Ly9naWxhcy5pbyIsIm5iZiI6MTcyNzE3MzUzMSwic3ViIjoiMTEzOTY3Njg5NzE2ODY2NjM0OTcwIn0.kHZZDlVnZsbnoSac0wtM3ezrPCkIBYVQSdkfbFsT_xs",
+     "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjIwNDI1MzM1ODksImp0aSI6IjEwNzM3MDcyODA4NDQxMTk0MTQwOSIsImlhdCI6MTcyNzE3MzU4OSwiaXNzIjoiaHR0cHM6Ly9naWxhcy5pbyIsIm5iZiI6MTcyNzE3MzU4OSwic3ViIjoiMTA3MzcwNzI4MDg0NDExOTQxNDA5In0.4qhnj6YhunOHoAMmosibf4CaopJqSlvwxvhB6671Suw",
+     "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjIwNDI1MzQ5ODEsImp0aSI6IjEwNjE2NTI5NzI5MjAxODExMzgwMCIsImlhdCI6MTcyNzE3NDk4MSwiaXNzIjoiaHR0cHM6Ly9naWxhcy5pbyIsIm5iZiI6MTcyNzE3NDk4MSwic3ViIjoiMTA2MTY1Mjk3MjkyMDE4MTEzODAwIn0.9QvgxTlDugcDwSa880B0hefhWjVfEzjTDX2ywgNORrc",
+     "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjIwNDI1MzUwNTIsImp0aSI6IjExMzA3MTQ4ODA5OTA0OTQzMDI0MSIsImlhdCI6MTcyNzE3NTA1MiwiaXNzIjoiaHR0cHM6Ly9naWxhcy5pbyIsIm5iZiI6MTcyNzE3NTA1Miwic3ViIjoiMTEzMDcxNDg4MDk5MDQ5NDMwMjQxIn0.Z8TNrz_LXCtFjE0BwBLCBqh03uTKZ6WWLptQA6zdy1Y",
+     "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjIwNDI1MzUxMjUsImp0aSI6IjExMTU3MzA2NjkwODIzNjk4MjM1OSIsImlhdCI6MTcyNzE3NTEyNSwiaXNzIjoiaHR0cHM6Ly9naWxhcy5pbyIsIm5iZiI6MTcyNzE3NTEyNSwic3ViIjoiMTExNTczMDY2OTA4MjM2OTgyMzU5In0.eQIqXoSbsD19AJrQxCVh7T6tcLvCJ7TH3c8Ajso9CJU",
+     "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjIwNDI1NTM2NjcsImp0aSI6IjEwMTkyMDcyMjAwOTgxNDEwMDE5MiIsImlhdCI6MTcyNzE5MzY2NywiaXNzIjoiaHR0cHM6Ly9naWxhcy5pbyIsIm5iZiI6MTcyNzE5MzY2Nywic3ViIjoiMTAxOTIwNzIyMDA5ODE0MTAwMTkyIn0.WmYY-BbcsYcvgZmes_eH5AS-06imEDslcNPH41UOH-c",
+ ]
+
+ OPENAI_CONFIG = {
+     "model": gpt_mini,
+ }
+
+ LLM_CONFIG = aval_ai
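
Note that LLM_CONFIG is unpacked straight into langchain's OpenAI(**LLM_CONFIG) in the fallback path of model/llm/llm.py, so whichever dict is assigned here may contain only keyword arguments that constructor accepts. Since aval_ai carries no api_key entry, that path relies on the OPENAI_API_KEY environment variable. A minimal sketch (the key value is a placeholder, not a real key):

    import os
    os.environ.setdefault("OPENAI_API_KEY", "sk-placeholder")  # placeholder only
    from langchain_openai import OpenAI
    llm = OpenAI(model=gpt_3_5, base_url=aval_ai["base_url"])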
model/__init__.py ADDED
File without changes
model/chat.py ADDED
@@ -0,0 +1,31 @@
+ from model.propmt.prompt_handler import *
+ from model.llm.llm import *
+ from model.rag.rag_handler import *
+ from config import *
+
+ class Chat:
+     def __init__(self, chat_id, rag_handler) -> None:
+         self.chat_id = chat_id
+         self.message_history = []
+         self.response_history = []
+         self.prompt_handler = Prompt()
+         self.llm = LLM_API_Call("gilas")
+         self.rag_handler = rag_handler
+
+     def response(self, message: str) -> str:
+         self.message_history.append(message)
+
+         info_list = self.rag_handler.get_information(message)
+         prompt = self.prompt_handler.get_prompt(message, info_list)
+         llm_response = self.llm.get_LLM_response(prompt=prompt)
+
+         final_response = f"**Response**:\n{llm_response}\n\n"
+         if info_list:
+             final_response += "The following legal cases and information were retrieved and considered:\n"
+             for i, info in enumerate(info_list):
+                 case_text = info['text'].replace("[end]", "")
+                 final_response += f"\n**Case {i+1}:** {info['title']}\n{case_text}\n"
+
+         self.response_history.append(final_response)
+
+         return final_response
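
Chat only ever calls rag_handler.get_information(message), so any object exposing that method can stand in during testing; the constructor still creates a real Gilas-backed LLM_API_Call, so producing a response requires valid keys in config.py. A minimal sketch with a stub retriever (all strings illustrative):

    class StubRAG:
        def get_information(self, message):
            return [{"title": "Sample case", "text": "Sample holding. [end]"}]

    chat = Chat(chat_id=1, rag_handler=StubRAG())
    print(chat.response("Is a verbal lease enforceable?"))  # performs a real LLM call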
model/controller.py ADDED
@@ -0,0 +1,18 @@
+ from model.chat import *
+ import sys
+ import os
+
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+
+ class Controller:
+     def __init__(self) -> None:
+         self.chat_dic = {}
+         self.rag_handler = RAG()
+
+     def handle_message(self,
+                        chat_id: int,
+                        message: str) -> str:
+         if chat_id not in self.chat_dic:
+             self.chat_dic[chat_id] = Chat(chat_id=chat_id, rag_handler=self.rag_handler)
+         chat = self.chat_dic[chat_id]
+         return chat.response(message)
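
Controller lazily creates one Chat per chat_id, so repeated calls with the same id share message history while different ids stay isolated. A short usage sketch (assumes the ChromaDB collection, processed_cases.csv, and API keys are all in place):

    ctrl = Controller()
    ctrl.handle_message(1, "First question")
    ctrl.handle_message(1, "A follow-up in the same session")  # reuses the same Chat object
    ctrl.handle_message(2, "An unrelated question")            # gets its own Chat object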
model/llm/llm.py ADDED
@@ -0,0 +1,117 @@
+ from langchain_openai import OpenAI
+ import openai
+ import sys
+ import os
+ import requests
+ from json import JSONDecodeError
+ import time
+
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+
+ from config import *
+
+
+ class LLM_API_Call:
+     def __init__(self, type) -> None:
+         if type == "openai":
+             # The active LLM_CONFIG must define an "api_key" entry for this path
+             self.llm = OpenAI_API_Call(api_key=LLM_CONFIG["api_key"],
+                                        model=LLM_CONFIG["model"])
+         elif type == "gilas":
+             self.llm = Gilas_API_Call(api_keys=GILAS_API_KEYS,
+                                       model=GILAS_CONFIG["model"],
+                                       base_url=GILAS_CONFIG["base_url"])
+         else:
+             self.llm = OpenAI(**LLM_CONFIG)
+
+     def get_LLM_response(self, prompt: str) -> str:
+         return self.llm.invoke(prompt)
+
+
+ class OpenAI_API_Call:
+
+     def __init__(self, api_key, model="gpt-4"):
+         self.api_key = api_key
+         openai.api_key = api_key
+         self.model = model
+         self.conversation = []
+
+     def add_message(self, role, content):
+         self.conversation.append({"role": role, "content": content})
+
+     def get_response(self):
+         # Uses the legacy (pre-1.0) openai SDK interface
+         response = openai.ChatCompletion.create(
+             model=self.model,
+             messages=self.conversation
+         )
+         return response['choices'][0]['message']['content']
+
+     def invoke(self, user_input):
+         self.add_message("user", user_input)
+         response = self.get_response()
+         self.add_message("assistant", response)
+         return response
+
+
+ class Gilas_API_Call:
+     def __init__(self, api_keys, base_url, model="gpt-4o-mini"):
+         self.api_keys = api_keys
+         self.base_url = base_url
+         self.model = model
+         self.headers = {
+             "Content-Type": "application/json"
+         }
+         self.conversation = []
+         self.retry_wait_time = 30
+
+     def add_message(self, role, content):
+         self.conversation.append({"role": role, "content": content})
+
+     def get_response(self, api_key):
+         self.headers["Authorization"] = f"Bearer {api_key}"
+
+         data = {
+             "model": self.model,
+             "messages": self.conversation
+         }
+
+         response = requests.post(
+             url=f"{self.base_url}/chat/completions",
+             headers=self.headers,
+             json=data
+         )
+
+         if response.status_code == 200:
+             try:
+                 return response.json()['choices'][0]['message']['content']
+             except (KeyError, IndexError, ValueError) as e:
+                 raise Exception(f"Unexpected API response format: {e}")
+         else:
+             raise Exception(f"Gilas API call failed: {response.status_code} - {response.text}")
+
+     def invoke(self, user_input, max_retries=3):
+         self.add_message("user", user_input)
+
+         retries = 0
+         while retries < max_retries:
+             for i, api_key in enumerate(self.api_keys):
+                 try:
+                     response = self.get_response(api_key)
+                     self.add_message("assistant", response)
+                     return response
+                 except Exception as e:
+                     print(f"Error encountered with API key {api_key}: {e}. Trying next key...")
+                     # Sleep only once the last key in the list has failed
+                     if i == len(self.api_keys) - 1:
+                         print(f"All keys failed. Retrying from the first key after {self.retry_wait_time} seconds...")
+                         time.sleep(self.retry_wait_time)
+                         self.retry_wait_time += 30  # Increase wait time for the next retry round
+             retries += 1
+
+         raise Exception(f"Failed to get a valid response after {max_retries} retries.")
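
The retry loop in Gilas_API_Call.invoke walks the whole key list on each failure, sleeps (with a wait that grows by 30 s) only after the last key fails, and gives up after max_retries full passes. A sketch of driving it directly, reusing the values from config.py (the prompt is illustrative):

    llm = Gilas_API_Call(api_keys=GILAS_API_KEYS, base_url=GILAS_CONFIG["base_url"])
    try:
        print(llm.invoke("Summarize the retrieved cases.", max_retries=2))
    except Exception as e:
        print(f"All keys exhausted: {e}")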
model/processor/case_crawler.py ADDED
@@ -0,0 +1,113 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import os
+ import warnings
+ from tqdm import tqdm
+
+ class Crawler:
+     # Delimiter inserted between votes when a list of votes is concatenated into one string
+     vote_splitter = " |split| "
+
+     def __init__(self, base_url: str, list_url: str,
+                  base_vote_url: str, models_path: str, result_path: str):
+         if base_url == "":
+             self.base_url = "https://ara.jri.ac.ir/"
+         else:
+             self.base_url = base_url
+
+         if list_url == "":
+             self.list_url = "https://ara.jri.ac.ir/Judge/Index"
+         else:
+             self.list_url = list_url
+
+         if base_vote_url == "":
+             self.base_vote_url = "https://ara.jri.ac.ir/Judge/Text/"
+         else:
+             self.base_vote_url = base_vote_url
+
+         if models_path == "":
+             self.models_path = "Models/"
+         else:
+             self.models_path = models_path
+         self.pos_model_path = os.path.join(self.models_path, "postagger.model")
+         self.chunker_path = os.path.join(self.models_path, "chunker.model")
+
+         if result_path == "":
+             self.result_path = "Resource/"
+         else:
+             self.result_path = result_path
+
+         self.merges_vote_path = os.path.join(self.result_path, 'merged_vote.txt')
+         self.clean_vote_path = os.path.join(self.result_path, 'clean_vote.txt')
+         self.clean_vote_path_csv = os.path.join(self.result_path, 'clean_vote.csv')
+         self.selected_vote_path = os.path.join(self.result_path, 'selected_vote.txt')
+         self.law_list_path = os.path.join(self.result_path, 'law_list.txt')
+         self.law_clean_list_path = os.path.join(self.result_path, 'law_clean_list.txt')
+         self.vote_stop_path = os.path.join(self.result_path, "vote_stopwords.txt")
+         self.law_stop_path = os.path.join(self.result_path, "law_stopwords.txt")
+
+     @staticmethod
+     def check_valid_vote(html_soup: BeautifulSoup) -> bool:
+         # Extract the title to detect invalid votes
+         h1_element = html_soup.find('h1', class_='Title3D')
+         if h1_element is None:
+             return False
+         span_text = h1_element.find('span').text  # Text within the <span> tag
+         full_text = h1_element.text  # Full text within the <h1> element
+         text_after_span = full_text.split(span_text)[-1].strip()  # Text after the </span> tag
+         return len(text_after_span) > 0
+
+     @staticmethod
+     def html_data_extractor(html_soup: BeautifulSoup, vote_splitter: str) -> str:
+         vote_text = html_soup.find('div', id='treeText', class_='BackText')
+         title = html_soup.find('h1', class_='Title3D')
+         info = html_soup.find('td', valign="top", class_="font-size-small")
+         # Append vote_splitter so votes can be separated again after merging
+         vote_df = str(title) + str(info) + str(vote_text) + vote_splitter
+         return vote_df
+
+     def vote_crawler(self, start: int, end: int, separator: int):
+         counter = 0  # Counts valid votes crawled
+         result_list = []
+         warnings.filterwarnings("ignore")
+         # Request each vote page in turn
+         for i in tqdm(range(start, end)):
+             # Flush every `separator` records to a .txt file
+             if (counter % separator == 0 and counter > 0) or i == end - 1:
+                 text_file = open(os.path.join(self.result_path, f'vote{i}.txt'), "w", encoding='utf-8')
+                 text_file.write(''.join(result_list))
+                 text_file.close()
+                 result_list = []
+             url = self.base_vote_url + f"{i}"
+             response = requests.get(url, verify=False)
+             # Set the encoding explicitly for Persian text
+             response.encoding = 'utf-8'
+             resp_text = response.text
+             html_soup = BeautifulSoup(resp_text, 'html.parser')
+             if response.ok and self.check_valid_vote(html_soup):
+                 counter += 1
+                 vote_df = self.html_data_extractor(html_soup, self.vote_splitter)
+                 result_list.append(vote_df)
+
+     def merge_out_txt(self) -> None:
+         # Merge the crawled vote .txt batches into merged_vote.txt
+         with open(self.merges_vote_path, 'w', encoding='utf-8') as outfile:
+             for filename in os.listdir(self.result_path):
+                 if filename.startswith("vote") and filename.endswith('.txt'):  # Only merge vote .txt files
+                     with open(os.path.join(self.result_path, filename), 'r', encoding='utf-8') as infile:
+                         outfile.write(infile.read())
+
+ if __name__ == "__main__":
+     base_url = input("Enter the base URL (default = https://ara.jri.ac.ir/): ")
+     list_url = input("Enter the list URL (default = https://ara.jri.ac.ir/Judge/Index): ")
+     base_vote_url = input("Enter the base vote URL (default = https://ara.jri.ac.ir/Judge/Text/): ")
+     models_path = input("Enter the models path (default = Models/): ")
+     result_path = input("Enter the result path (default = Resource/): ")
+
+     crawler_instance = Crawler(models_path=models_path, result_path=result_path, base_url=base_url, list_url=list_url, base_vote_url=base_vote_url)
+     start = int(input("Enter the start value for vote crawling: "))
+     end = int(input("Enter the end value for vote crawling: "))
+     separator = int(input("Enter the separator value for vote crawling: "))
+
+     crawler_instance.vote_crawler(start=start, end=end, separator=separator)
+     crawler_instance.merge_out_txt()
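
The interactive prompts above pass empty strings through, which makes the constructor fall back to its built-in defaults; the crawler can equally be driven non-interactively. A sketch, assuming the Resource/ directory exists (the crawl range and batch size are illustrative):

    crawler = Crawler(base_url="", list_url="", base_vote_url="",
                      models_path="", result_path="")
    crawler.vote_crawler(start=1000, end=1100, separator=25)  # writes vote*.txt batches under Resource/
    crawler.merge_out_txt()                                   # concatenates them into Resource/merged_vote.txt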
model/processor/database_Chunker.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
model/processor/law_provider.py ADDED
@@ -0,0 +1,61 @@
+ import pandas as pd
+ import re
+
+ class LawTextPreProcessor():
+
+     def __init__(self, law_texts: list) -> None:
+         self._law_texts = law_texts
+         self._law_name_df = pd.DataFrame(columns=["law_index", "law_name"])
+         self._madeh_df = pd.DataFrame(columns=["law_index", "madeh_index", "madeh_text"])
+         self._is_df = False
+
+     def build_df(self):
+         title_list = []
+         madeh_list = []
+         madeh_index = []
+         law_index = []
+         counter = 0
+         for text in self._law_texts:
+             title = self.title_extractor(text)
+             title_list.append(title)
+             temp_madeh_list = self.madeh_extractor(text, title == "قانون اساسی جمهوری اسلامی ایران")
+             law_index.extend([counter for i in temp_madeh_list])
+             madeh_index.extend([i + 1 for i in range(len(temp_madeh_list))])
+             madeh_list.extend(temp_madeh_list)
+             counter += 1
+         law_index_list = [i for i in range(counter)]
+         self._madeh_df = pd.DataFrame({"law_index": law_index,
+                                        "madeh_index": madeh_index,
+                                        "madeh_text": madeh_list})
+         self._law_name_df = pd.DataFrame({"law_index": law_index_list,
+                                           "law_name": title_list})
+
+     def title_extractor(self, law_text: str) -> str:
+         first_newline_index = law_text.find('\n')
+         return law_text[:first_newline_index]
+
+     def madeh_extractor(self, law_text: str, is_asl: bool = False) -> list:
+         result = []
+         pattern = r"(^.{0,1}اصل )" if is_asl else r"(^.{0,1}ماده)"
+         removed_regex = r"❯.*\n"
+         notvalid_pattern = r"(^.{0,1}ماده.*مکرر\n)"
+         cleaned_text = re.sub(removed_regex, "", law_text)
+         matches = re.finditer(pattern, cleaned_text, flags=re.MULTILINE)
+         not_valid_matches = re.finditer(notvalid_pattern, cleaned_text, flags=re.MULTILINE)
+         indices = [match.start() for match in matches]
+         not_valid_indices = [match.start() for match in not_valid_matches]
+         valid_indices = [item for item in indices if item not in not_valid_indices]
+         for i in range(len(valid_indices)):
+             start = valid_indices[i]
+             if i != len(valid_indices) - 1:
+                 end = valid_indices[i + 1]
+                 result.append(cleaned_text[start:end])
+             else:
+                 result.append(cleaned_text[start:])
+         return result
+
+     def get_df(self) -> tuple:
+         if not self._is_df:
+             self.build_df()
+             self._is_df = True
+         return self._law_name_df, self._madeh_df
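
Each entry in law_texts is expected to be one statute: the title on its first line and articles introduced by "ماده" (or "اصل" for the constitution). A sketch with a toy placeholder text, not a real statute:

    sample = "قانون نمونه\nماده 1 - متن ماده اول\nماده 2 - متن ماده دوم\n"
    pre = LawTextPreProcessor([sample])
    law_names_df, madeh_df = pre.get_df()
    print(law_names_df)  # one row: law_index 0, law_name "قانون نمونه"
    print(madeh_df)      # two rows, one per ماده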
model/processor/pre_process.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
model/processor/retrieval_rag_nlp_project.ipynb:Zone.Identifier ADDED
Binary file (27 Bytes).
 
model/propmt/__init__.py ADDED
File without changes
model/propmt/prompt_handler.py ADDED
@@ -0,0 +1,16 @@
+ from typing import List
+
+ class Prompt:
+
+     def get_prompt(self, message: str, info_list: List) -> str:
+         prompt = f"As a user, I want to ask you the following legal question:\n{message}\n\n"
+
+         if info_list:
+             prompt += "Here are some relevant legal cases and information you should consider:\n"
+             for i, info in enumerate(info_list):
+                 prompt += f"Case {i+1}:\n{info['title']}\n{info['text']}\n"
+
+         prompt += ("\nBased on the provided information, please respond in Persian (Farsi) with a concise legal analysis. "
+                    "Ensure that your response is as summarized and clear as possible (one paragraph).")
+
+         return prompt
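
A quick look at what the assembled prompt contains; the case dict mirrors the shape returned by RAG.retrieve_relevant_cases, and the contents are illustrative:

    p = Prompt()
    cases = [{"title": "Sample case title", "text": "Sample case text"}]
    print(p.get_prompt("Is a verbal lease enforceable?", cases))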
model/rag/__init__.py ADDED
File without changes
model/rag/rag_handler.py ADDED
@@ -0,0 +1,89 @@
+ from typing import List
+ import chromadb
+ from transformers import AutoTokenizer, AutoModel
+ from chromadb.config import Settings
+ import torch
+ import numpy as np
+ import pandas as pd
+ from tqdm import tqdm
+ import os
+ from hazm import *
+
+
+ class RAG:
+     def __init__(self,
+                  model_name: str = "HooshvareLab/bert-base-parsbert-uncased",
+                  collection_name: str = "legal_cases",
+                  persist_directory: str = "chromadb_collections/",
+                  top_k: int = 2
+                  ) -> None:
+
+         self.cases_df = pd.read_csv('processed_cases.csv')
+
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModel.from_pretrained(model_name)
+         self.normalizer = Normalizer()
+         self.top_k = top_k
+
+         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+         self.model.to(self.device)
+
+         self.client = chromadb.PersistentClient(path=persist_directory)
+
+         self.collection = self.client.get_collection(name=collection_name)
+
+     def query_pre_process(self, query: str) -> str:
+         return self.normalizer.normalize(query)
+
+     def embed_single_text(self, text: str) -> np.ndarray:
+         inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
+         inputs = {key: value.to(self.device) for key, value in inputs.items()}
+         with torch.no_grad():
+             outputs = self.model(**inputs)
+         # Mean-pool the last hidden state into a single sentence embedding
+         return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
+
+     def extract_case_title_from_df(self, case_id: str) -> str:
+         case_id_int = int(case_id.split("_")[1])
+
+         try:
+             case_title = self.cases_df.loc[case_id_int, 'title']
+             return case_title
+         except KeyError:
+             return "Case ID not found in DataFrame."
+
+     def extract_case_text_from_df(self, case_id: str) -> str:
+         case_id_int = int(case_id.split("_")[1])
+
+         try:
+             case_text = self.cases_df.loc[case_id_int, 'text']
+             return case_text
+         except KeyError:
+             return "Case ID not found in DataFrame."
+
+     def retrieve_relevant_cases(self, query_text: str) -> List[dict]:
+         normalized_query_text = self.query_pre_process(query_text)
+
+         query_embedding = self.embed_single_text(normalized_query_text)
+         query_embedding_list = query_embedding.tolist()
+
+         results = self.collection.query(
+             query_embeddings=[query_embedding_list],
+             n_results=self.top_k
+         )
+
+         retrieved_cases = []
+         for i in range(len(results['metadatas'][0])):
+             case_id = results['ids'][0][i]
+             case_text = self.extract_case_text_from_df(case_id)
+             case_title = self.extract_case_title_from_df(case_id)
+             retrieved_cases.append({
+                 "text": case_text,
+                 "title": case_title
+             })
+
+         return retrieved_cases
+
+     def get_information(self, query: str) -> List[dict]:
+         return self.retrieve_relevant_cases(query)
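
Retrieval in brief: the query is normalized with hazm, embedded by mean-pooling ParsBERT's last hidden state, and matched against the persisted Chroma collection, with titles and full texts joined back in from processed_cases.csv. A minimal sketch (assumes processed_cases.csv and chromadb_collections/ exist as set up in app.py; the query is illustrative):

    rag = RAG(top_k=2)
    for case in rag.retrieve_relevant_cases("یک سوال حقوقی نمونه"):
        print(case["title"])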
requirements.txt ADDED
@@ -0,0 +1,26 @@
+ #dataset
+ datasets
+ pandas
+ numpy
+ indexed_gzip
+ # json
+ matrix-nio[e2e]
+ opsdroid
+ python-dotenv
+
+ BeautifulSoup4
+ requests
+ tqdm
+
+ hazm
+ spacy
+
+ rank_bm25
+ openai
+ gradio
+
+ langchain_openai
+ sentence-transformers
+ chromadb
+ rarfile
+ patool