import os
import shutil
# SECURITY: a Cohere API key used to be hard-coded on this line. Credentials
# must not live in source control; COHERE_API_KEY is expected to be supplied
# through the process environment (or a .env file). The previously committed
# key should be treated as leaked and rotated.
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredFileIOLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain_community.document_loaders import UnstructuredCSVLoader
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.document_loaders import JSONLoader
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import UnstructuredXMLLoader
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
import os
import google.generativeai as genai
import re
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain_community.llms import Cohere
import pandas as pd
import json

def process_json_file(file_path):
    """Read a line-delimited JSON file and return every value that decodes.

    Each non-blank line is decoded on its own. A line that fails to decode is
    retried once with a trailing comma removed, so rows copied out of a JSON
    array (``{"a": 1},``) are still accepted. Lines that fail both attempts are
    reported on stdout and skipped.

    Args:
        file_path: Path of the file to read. It is decoded as UTF-8; a leading
            byte-order mark is tolerated.

    Returns:
        A list with one decoded JSON value per successfully parsed line, in
        file order.
    """
    json_data = []
    # "utf-8-sig" behaves like UTF-8 but also swallows an optional BOM, which
    # json.loads would otherwise reject on the first line.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            candidate = line.strip()
            if not candidate:
                # Blank lines carry no data and are not worth an error message.
                continue
            try:
                json_data.append(json.loads(candidate))
                continue
            except json.JSONDecodeError:
                pass
            try:
                # Second chance: drop a trailing comma (array-element style).
                json_data.append(json.loads(candidate.rstrip(',').rstrip()))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
    return json_data


from groq import Groq


def _require_env(name):
    """Return the non-empty value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is unset or empty, so a missing
            credential is reported at import time with a clear message.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; "
            "export it or define it in a .env file."
        )
    return value


# SECURITY: API keys used to be hard-coded here. They are now read from the
# environment; load_dotenv() additionally picks them up from a local .env file
# without overriding variables that are already set. The previously committed
# keys should be treated as leaked and rotated.
load_dotenv()

# Groq client used by get_chat_completion().
client = Groq(api_key=_require_env("GROQ_API_KEY"))

# The Cohere objects in this file are constructed without an explicit key, so
# they presumably read COHERE_API_KEY from the environment; fail fast here.
_require_env("COHERE_API_KEY")

# Google Generative AI configuration. GOOGLE_API_KEY stays in the environment
# for the embedding wrapper below, which is not given a key explicitly.
genai.configure(api_key=_require_env("GOOGLE_API_KEY"))

# Embedding model
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

def extract_file_names(text):
    """Return every ``name.ext`` token found in *text*, in order of appearance."""
    # One word, a dot, then another word, delimited by word boundaries.
    name_with_extension = r'\b(\w+\.\w+)\b'
    return [hit.group(1) for hit in re.finditer(name_with_extension, text)]

def extract_multi_metadata_content(texts, tests):
    """Concatenate the page content of documents whose source matches each name.

    For every entry of *tests*, a document from *texts* is selected when any
    whitespace-separated part of the entry occurs (case-insensitively) in the
    document's ``metadata['source']``.

    Args:
        texts: Iterable of objects exposing ``metadata['source']`` and
            ``page_content``.
        tests: Iterable of name strings to look for.

    Returns:
        One string. Only the first entry's section is prefixed with a
        "Dữ liệu của <name>:" header; every later section is wrapped in
        newlines without a header.
    """
    sections = []
    for position, name in enumerate(tests):
        selected = []
        for doc in texts:
            source = doc.metadata['source'].lower()
            for part in name.split():
                if part.lower() in source:
                    selected.append(doc.page_content)
                    break
        body = "".join(selected)
        if position:
            sections.append("\n" + body + "\n")
        else:
            # Only the first section carries a header line.
            sections.append(f"Dữ liệu của {name}:\n {body}")
    return "".join(sections)

def extract_filename(text):
    """Return the word that follows each "file" / "tập tin" in *text*.

    The text is lower-cased before matching, so the returned words are
    lower-case as well.
    """
    lowered = text.lower()
    word_after_marker = r'\b(?:file|tập\s+tin)\s+(\w+)\b'
    return [hit.group(1) for hit in re.finditer(word_after_marker, lowered)]

def extract_all_filenames_1(text):
    """Return the results of extract_file_names() followed by extract_filename().

    No de-duplication is performed; a name found by both helpers appears twice.
    """
    with_extension = extract_file_names(text)
    after_marker = extract_filename(text)
    return [*with_extension, *after_marker]

def extract_all_filenames(text):
    """Collect candidate file names mentioned in *text*, without duplicates.

    Two sources are combined, in this order:
      1. the stem of every ``stem.ext`` token (original casing), and
      2. the word following "file" / "tập tin" (taken from the lower-cased text).

    Names are compared case-insensitively; the first spelling encountered wins.

    Returns:
        A list of unique names in order of first appearance.
    """
    stems = re.findall(r'\b(\w+)\.\w+\b', text)
    after_marker = re.findall(r'\b(?:file|tập\s+tin)\s+(\w+)\b', text.lower())

    # dict preserves insertion order, and setdefault keeps the first spelling
    # seen for each lower-cased key.
    first_spelling = {}
    for candidate in stems + after_marker:
        first_spelling.setdefault(candidate.lower(), candidate)
    return list(first_spelling.values())

def find_matching_files_in_docs(unique_filenames, folder_path="/code/temp"):
    """Find files under *folder_path* whose name contains one of the given names.

    Matching is a case-insensitive substring test against the bare file name
    (directories are walked recursively, but only file names are compared).

    Args:
        unique_filenames: Iterable of name fragments to look for.
        folder_path: Root folder to search. Defaults to the upload folder the
            rest of this module uses.

    Returns:
        A list of matching file names (no directory part), each reported once,
        in the order they were discovered.
    """
    # Walk the tree once and reuse the listing for every requested name
    # instead of re-walking the file system per name.
    directory_listings = [files for _root, _dirs, files in os.walk(folder_path)]

    matching_files = []
    seen_filenames = set()  # lower-cased names that were already reported

    for filename in unique_filenames:
        needle = filename.lower()
        for files in directory_listings:
            for file in files:
                lowered = file.lower()
                if needle in lowered and lowered not in seen_filenames:
                    matching_files.append(file)
                    seen_filenames.add(lowered)
                    # Stops scanning the current directory only; remaining
                    # directories are still searched for this name.
                    break
    return matching_files
def _match_files_in_folder(text, folder_path):
    """Match the words of *text* against the file names found under *folder_path*.

    The query is tokenized into ``name.ext`` pairs and plain words. For every
    file, the first token that occurs (case-insensitively) as a substring of
    the file name is recorded.

    Returns:
        ``(matching_files, matched_terms)`` where ``matching_files`` is a set of
        file names and ``matched_terms`` lists the matching query tokens in
        their original casing, ordered by position in the query. A token
        appears once per file it matched.
    """
    # Keep "name.ext" together; everything else is split into single words.
    original_terms = re.findall(r'\b\w+\.\w+\b|\b\w+\b', text)
    # Lower-case token by token so both lists always share the same indices.
    lowered_terms = [term.lower() for term in original_terms]

    matching_files = set()
    matching_index = []
    for _root, _dirs, files in os.walk(folder_path):
        for file in files:
            lowered_file = file.lower()
            for position, term in enumerate(lowered_terms):
                if term in lowered_file:
                    matching_files.add(file)
                    matching_index.append(position)
                    break  # first matching token wins for this file

    matching_index.sort()
    return matching_files, [original_terms[i] for i in matching_index]


def find_matching_files_in_docs_12_id(text, id, base_dir="/code/temp"):
    """Match the words of *text* against files in the per-id upload folder.

    Args:
        text: The user query.
        id: Name of the sub-folder of *base_dir* to search.
            NOTE(review): *id* is interpolated into a path unchecked — confirm
            callers never pass untrusted values such as "../".
        base_dir: Root of the upload area; defaults to the module's folder.

    Returns:
        ``(matching_files, matched_terms)``: a set of file names and the list
        of query tokens that matched, ordered by position in the query.
    """
    return _match_files_in_folder(text, f"{base_dir}/{id}")


def find_matching_files_in_docs_12(text, folder_path="/code/temp"):
    """Match the words of *text* against files in the shared upload folder.

    Args:
        text: The user query.
        folder_path: Folder to search; defaults to the module's upload folder.

    Returns:
        ``(matching_files, matched_terms)``: a set of file names and the list
        of query tokens that matched, ordered by position in the query.
    """
    return _match_files_in_folder(text, folder_path)

def separate_csv_xlsx(files_list):
    """Split file names into spreadsheet files (.csv/.xlsx) and everything else.

    Returns:
        ``(list_csv, list_other)``, each preserving the input order.
    """
    list_csv, list_other = [], []
    for name in files_list:
        is_spreadsheet = name.endswith(('.csv', '.xlsx'))
        bucket = list_csv if is_spreadsheet else list_other
        bucket.append(name)
    return list_csv, list_other

def convert_xlsx_to_csv(xlsx_file_path, csv_file_path):
    """Read the workbook at *xlsx_file_path* with pandas and write it as CSV.

    The CSV is written to *csv_file_path* without the index column.
    """
    pd.read_excel(xlsx_file_path).to_csv(csv_file_path, index=False)

def save_list_CSV_id(file_list, id):
    """Load spreadsheet files from the per-id folder and return them as text.

    ``.xlsx`` entries are first converted to a sibling ``.csv`` file, which is
    then loaded in their place. Every file contributes a
    "Dữ liệu file <name>:" header followed by one line per loaded document.

    Args:
        file_list: File names located in ``/code/temp/<id>``.
        id: Name of the sub-folder to read from.

    Returns:
        The concatenated text of all files ("" when *file_list* is empty).
    """
    folder = f"/code/temp/{id}"
    pieces = []
    for name in file_list:
        if name.endswith('.xlsx'):
            workbook_path = f"{folder}/{name}"
            convert_xlsx_to_csv(workbook_path, workbook_path.replace(".xlsx", ".csv"))
            # From here on, work with the freshly written CSV file.
            name = name.replace(".xlsx", ".csv")
        csv_loader = CSVLoader(f"{folder}/{name}")
        print(name)
        rows = csv_loader.load()
        pieces.append(f"Dữ liệu file {name}:\n")
        pieces.extend(row.page_content + "\n" for row in rows)
    return "".join(pieces)

def save_list_CSV(file_list):
    """Load spreadsheet files from ``/code/temp`` and return them as text.

    ``.xlsx`` entries are first converted to a sibling ``.csv`` file, which is
    then loaded in their place. Every file contributes a
    "Dữ liệu file <name>:" header followed by one line per loaded document.

    Returns:
        The concatenated text of all files ("" when *file_list* is empty).
    """
    pieces = []
    for name in file_list:
        if name.endswith('.xlsx'):
            workbook_path = f"/code/temp/{name}"
            convert_xlsx_to_csv(workbook_path, workbook_path.replace(".xlsx", ".csv"))
            # From here on, work with the freshly written CSV file.
            name = name.replace(".xlsx", ".csv")
        csv_loader = CSVLoader(f"/code/temp/{name}")
        print(name)
        rows = csv_loader.load()
        pieces.append(f"Dữ liệu file {name}:\n")
        pieces.extend(row.page_content + "\n" for row in rows)
    return "".join(pieces)

def extract_query(query, text_alls):
    """Gather the text of every file mentioned in *query*.

    Spreadsheet files (.csv/.xlsx) are loaded through save_list_CSV(); all
    other matched files are looked up in *text_alls* by their source metadata.

    Args:
        query: The user query.
        text_alls: Pre-split documents, as passed on to
            extract_multi_metadata_content().

    Returns:
        The spreadsheet text followed by the document text.
    """
    # find_matching_files_in_docs_12 returns (file names, matched terms); only
    # the file names are wanted here. Passing the whole tuple on (as this
    # function used to) makes separate_csv_xlsx call .endswith on a set.
    matching_files, _matched_terms = find_matching_files_in_docs_12(query)
    list_csv, list_other = separate_csv_xlsx(matching_files)
    csv_text = save_list_CSV(list_csv)
    document_text = extract_multi_metadata_content(text_alls, set(list_other))
    return csv_text + document_text

def chat_gemini(query, text_merge):
    """Ask the Gemini model to answer *query* based on *text_merge*.

    A fresh chat with empty history is started for every call.

    Returns:
        The text of the last message in the conversation.
    """
    prompt = f"Dựa vào nội dung sau:{text_merge}. Hãy trả lời câu hỏi sau đây: {query}"

    # Sampling parameters are all set to their minimum values.
    generation_config = dict(
        temperature=0.0,
        top_p=0.0,
        top_k=0,
        max_output_tokens=8192,
    )

    # The same blocking threshold is applied to every listed harm category.
    harm_categories = (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
    safety_settings = [
        {"category": category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
        for category in harm_categories
    ]

    model = genai.GenerativeModel(
        model_name="gemini-1.5-pro-latest",
        generation_config=generation_config,
        safety_settings=safety_settings,
    )
    convo = model.start_chat(history=[])
    convo.send_message(f"{prompt}")
    return convo.last.text

def extract_content_between_keywords(query, keywords):
    """Split *query* into one sub-question per keyword.

    For every keyword that occurs in *query*, the text between the previous
    found keyword (or the start of the query) and the next found keyword (or
    the end of the query) is attached to it.

    Args:
        query: The full user query.
        keywords: Sequence of keywords, expected in the order they appear in
            the query. Keywords that do not occur in the query are skipped.

    Returns:
        A dict mapping each found keyword to
        ``"<text before> <keyword> <text after>"``. Text lying between two
        keywords is shared by both entries.
    """
    contents = {}
    # End offset of the most recent keyword that was actually found; 0 means
    # "start of the query". Tracking this (rather than keywords[i-1]) keeps the
    # slice correct when the preceding keyword is absent from the query.
    previous_end = 0

    for i, keyword in enumerate(keywords):
        keyword_position = query.find(keyword)
        if keyword_position == -1:
            continue
        keyword_end = keyword_position + len(keyword)

        # Default to the end of the query; only a later keyword that is really
        # present may shorten the span. (Previously a failed lookup left -1
        # here, which silently dropped the last character of the query.)
        next_keyword_position = len(query)
        for next_keyword in keywords[i + 1:]:
            found_at = query.find(next_keyword)
            if found_at != -1:
                next_keyword_position = found_at
                break

        content_before = query[previous_end:keyword_position].strip()
        content_after = query[keyword_end:next_keyword_position].strip()
        contents[keyword] = f"{content_before} {keyword} {content_after}"
        previous_end = keyword_end

    return contents

def merge_files(file_set, file_list):
    """Map the stem of each name in *file_list* to a file from *file_set*.

    The stem is the part before the first dot. It is mapped to the first entry
    of *file_set* that contains it as a substring; stems without any match are
    left out.

    Returns:
        A dict ``{stem: matching file name}``.
    """
    merged_files = {}
    for file_name in file_list:
        stem = file_name.partition('.')[0]
        match = next((candidate for candidate in file_set if stem in candidate), None)
        if match is not None:
            merged_files[stem] = match
    return merged_files

def replace_keys_with_values(original_dict, replacement_dict):
    """Return a copy of *original_dict* with its keys renamed.

    Args:
        original_dict: The dict whose keys should be renamed.
        replacement_dict: Maps an old key to its new name. Keys of
            *original_dict* that are missing here are kept unchanged.

    Returns:
        A new dict; the values are untouched. If two keys end up with the same
        name, the later one wins.
    """
    return {
        replacement_dict[key] if key in replacement_dict else key: value
        for key, value in original_dict.items()
    }

def aws1_csv(new_dict_csv):
    """Load every spreadsheet named in *new_dict_csv* and join their queries.

    Args:
        new_dict_csv: Maps a spreadsheet file name to the query about it.

    Returns:
        ``(text, query_all)``: the concatenated output of save_list_CSV() for
        each file, and all query strings concatenated without a separator.
    """
    texts = []
    queries = []
    for key, value in new_dict_csv.items():
        print(key, value)
        queries.append(value)
        keyword = [key]
        print(keyword)
        texts.append(save_list_CSV(keyword))
    return "".join(texts), "".join(queries)

def aws1_csv_id(new_dict_csv, id):
    """Load every spreadsheet named in *new_dict_csv* from the per-id folder.

    Args:
        new_dict_csv: Maps a spreadsheet file name to the query about it.
        id: Sub-folder name forwarded to save_list_CSV_id().

    Returns:
        ``(text, query_all)``: the concatenated output of save_list_CSV_id()
        for each file, and all query strings concatenated without a separator.
    """
    texts = []
    queries = []
    for key, value in new_dict_csv.items():
        print(key, value)
        queries.append(value)
        keyword = [key]
        print(keyword)
        texts.append(save_list_CSV_id(keyword, id))
    return "".join(texts), "".join(queries)


def aws1(new_dict, text_alls):
    """Collect the retrieved context for every (file, query) pair.

    For each entry the matching documents are pulled from *text_alls*,
    re-split, indexed in a Chroma store persisted under a directory named
    after the key, and passed through a ContextualCompressionRetriever built
    on CohereRerank.

    Args:
        new_dict: Maps a file name (key) to the query about that file.
        text_alls: Pre-split documents searched by source metadata.

    Returns:
        ``(text, query_all)``: the retrieved passages of all entries, each
        group prefixed with "Dữ liệu file<key>", and all queries concatenated.
    """
    text_parts = []
    query_parts = []
    for key, query in new_dict.items():
        query_parts.append(query)
        keyword, _matched_terms = find_matching_files_in_docs_12(query)
        print(query)
        print(keyword)
        data = extract_multi_metadata_content(text_alls, keyword)

        # Split the collected data again before re-ranking.
        splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=2200, chunk_overlap=400
        )
        texts_data = splitter.split_text(data)
        vectordb = Chroma.from_texts(
            texts_data,
            embedding=embeddings,
            persist_directory=f'{key}',
        )
        # Fetch every chunk and leave the selection to the compressor.
        chunk_count = round(len(texts_data))
        retriever = vectordb.as_retriever(search_kwargs={"k": chunk_count})
        llm = Cohere(temperature=0)  # constructed but not used afterwards
        compression_retriever = ContextualCompressionRetriever(
            base_compressor=CohereRerank(), base_retriever=retriever
        )
        compressed_docs = compression_retriever.get_relevant_documents(f"{query}")

        text_parts.append("Dữ liệu file" + f"{key}")
        text_parts.extend(doc.page_content for doc in compressed_docs)

    return "".join(text_parts), "".join(query_parts)
    
from groq import Groq

def get_chat_completion(prompt_query):
    """Send *prompt_query* to the Groq chat-completions endpoint.

    Returns:
        The content of the first choice, or ``False`` if anything at all goes
        wrong (every exception is swallowed so that callers can fall back to
        another model).
    """
    try:
        conversation = [
            {
                "role": "system",
                "content": "Bạn là một trợ lý trung thưc, trả lời dựa trên nội dung tài liệu được cung cấp. Chỉ trả lời liên quan đến câu hỏi một cách đầy đủ chính xác, không bỏ sót thông tin.",
            },
            {
                "role": "user",
                "content": f"{prompt_query}",
            },
        ]
        response = client.chat.completions.create(
            messages=conversation,
            # The language model which will generate the completion.
            model="llama3-70b-8192",
            temperature=0.0,
            # Upper bound on generated tokens.
            # NOTE(review): 9000 looks larger than the "8192" in the model
            # name — confirm the API accepts it rather than always failing.
            max_tokens=9000,
            # No custom stop sequence.
            stop=None,
            # Return the whole completion at once instead of streaming deltas.
            stream=False,
        )
        return response.choices[0].message.content
    except Exception as error:
        # Any failure (rate limits included) is reported as False.
        return False
# Print the completion returned by the LLM.

def initialize_generative_model(prompt):
    """Send *prompt* to the Gemini model in a fresh chat and return the reply.

    Returns:
        The text of the last message in the conversation.
    """
    # Sampling parameters are all set to their minimum values.
    generation_config = dict(
        temperature=0.0,
        top_p=0.0,
        top_k=0,
        max_output_tokens=8192,
    )

    # The same blocking threshold is applied to every listed harm category.
    harm_categories = (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
    safety_settings = [
        {"category": category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
        for category in harm_categories
    ]

    model = genai.GenerativeModel(
        model_name="gemini-1.5-pro-latest",
        generation_config=generation_config,
        safety_settings=safety_settings,
    )

    # Start from an empty history, send the prompt, read back the reply.
    convo = model.start_chat(history=[])
    convo.send_message(prompt)
    return convo.last.text

# Example usage:

def question_answer(question):
    """Answer *question* with Groq, falling back to Gemini.

    get_chat_completion() is tried first; when it yields a falsy result
    (``False`` on error, or an empty answer) the question is sent to
    initialize_generative_model() instead.
    """
    groq_answer = get_chat_completion(question)
    return groq_answer if groq_answer else initialize_generative_model(question)


def aws1_all_id(new_dict, text_alls, id):
    """Answer every (file, query) pair using documents from the per-id folder.

    For each entry the matching documents are pulled from *text_alls*,
    re-split, indexed in a temporary Chroma store, passed through a
    ContextualCompressionRetriever built on CohereRerank(top_n=3), and the
    retrieved passages are sent to question_answer() together with the query.

    Args:
        new_dict: Maps a file name (key) to the query about that file.
        text_alls: Pre-split documents searched by source metadata.
        id: Name of the sub-folder of ``/code/temp`` that holds the files and
            the temporary vector store.

    Returns:
        All answers, each followed by a newline ("" for an empty *new_dict*).
    """
    # SECURITY: a Cohere API key used to be hard-coded here. The key is now
    # expected in the COHERE_API_KEY environment variable; the previously
    # committed key should be treated as leaked and rotated.
    answer = ""
    for key, value in new_dict.items():
        query = value
        keyword, _matched_terms = find_matching_files_in_docs_12_id(query, id)
        print(value)
        print(keyword)
        data = extract_multi_metadata_content(text_alls, keyword)

        # Split the collected data again before re-ranking.
        text_splitter = CharacterTextSplitter(chunk_size=2200, chunk_overlap=1500)
        texts_data = text_splitter.split_text(data)

        persist_directory = f"/code/temp/{id}/vector_db/{key}"
        try:
            vectordb_query = Chroma.from_texts(
                texts_data,
                embedding=embeddings,
                persist_directory=persist_directory,
            )
            # Fetch every chunk and leave the selection to the compressor.
            retriever = vectordb_query.as_retriever(
                search_kwargs={"k": len(texts_data)}
            )
            compression_retriever = ContextualCompressionRetriever(
                base_compressor=CohereRerank(top_n=3), base_retriever=retriever
            )
            compressed_docs = compression_retriever.get_relevant_documents(f"{query}")

            text = "Dữ liệu file" + f"{key}"
            text += "".join(doc.page_content for doc in compressed_docs)

            prompt_document = f"Dựa vào nội dung sau:{text}. Hãy trả lời câu hỏi sau đây: {query}. Mà không thay đổi nội dung mà mình đã cung cấp"
            answer_for = question_answer(prompt_document)
            answer += answer_for + "\n"
        finally:
            # Always drop the temporary store, even when a step above fails;
            # a left-over directory would be reused by the next query that
            # persists to the same path.
            shutil.rmtree(persist_directory, ignore_errors=True)

    return answer

def aws1_all(new_dict, text_alls):
    """Answer every (file, query) pair using documents from the shared folder.

    For each entry the matching documents are pulled from *text_alls*,
    re-split, indexed in a Chroma store persisted under a directory named
    after the key, passed through a ContextualCompressionRetriever built on
    CohereRerank, and the retrieved passages are sent to question_answer().

    Args:
        new_dict: Maps a file name (key) to the query about that file.
        text_alls: Pre-split documents searched by source metadata.

    Returns:
        All answers, each followed by a newline ("" for an empty *new_dict*).
    """
    answers = []
    for key, query in new_dict.items():
        keyword, _matched_terms = find_matching_files_in_docs_12(query)
        print(query)
        print(keyword)
        data = extract_multi_metadata_content(text_alls, keyword)

        # Split the collected data again before re-ranking.
        splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=2200, chunk_overlap=400
        )
        texts_data = splitter.split_text(data)
        vectordb = Chroma.from_texts(
            texts_data,
            embedding=embeddings,
            persist_directory=f'{key}',
        )
        # Fetch every chunk and leave the selection to the compressor.
        chunk_count = round(len(texts_data))
        retriever = vectordb.as_retriever(search_kwargs={"k": chunk_count})
        llm = Cohere(temperature=0)  # constructed but not used afterwards
        compression_retriever = ContextualCompressionRetriever(
            base_compressor=CohereRerank(), base_retriever=retriever
        )
        compressed_docs = compression_retriever.get_relevant_documents(f"{query}")

        passages = "".join(doc.page_content for doc in compressed_docs)
        text = "Dữ liệu file" + f"{key}" + passages

        prompt_document = f"Dựa vào nội dung sau:{text}. Hãy trả lời câu hỏi sau đây: {query}. Mà không thay đổi, chỉnh sửa nội dung mà mình đã cung cấp"
        answers.append(question_answer(prompt_document) + "\n")

    return "".join(answers)

def question_answer_all_query_v1(query, text_alls):
    """Answer a query that may refer to several files in the shared folder.

    The query is split into one sub-question per mentioned file. Sub-questions
    about spreadsheets (.xlsx/.csv) are answered from the raw sheet content in
    a single prompt; all others go through aws1_all().

    Args:
        query: The full user query.
        text_alls: Pre-split documents, forwarded to aws1_all().

    Returns:
        The document answers followed by the spreadsheet answer (if any).
    """
    matching_files, matched_terms = find_matching_files_in_docs_12(query)
    keywords = list(matched_terms)
    contents = extract_content_between_keywords(query, keywords)
    # Rename the sub-question keys from query terms to real file names.
    new_dict = replace_keys_with_values(contents, merge_files(matching_files, keywords))

    # Move the spreadsheet entries out of new_dict; they are handled apart.
    removed_files = {
        filename: sub_query
        for filename, sub_query in new_dict.items()
        if filename.endswith(('.xlsx', '.csv'))
    }
    for filename in removed_files:
        new_dict.pop(filename)

    text_csv, query_csv = aws1_csv(removed_files)
    answer_csv = ""
    # Gate on the loaded spreadsheet text. The previous check tested a variable
    # that was always "", so spreadsheet questions were never answered.
    if text_csv:
        prompt_csv = f"Dựa vào nội dung sau: {text_csv}. Hãy trả lời câu hỏi sau đây: {query_csv}.Bằng tiếng Việt"
        answer_csv = question_answer(prompt_csv)

    answer_document = aws1_all(new_dict, text_alls)

    return answer_document + answer_csv

def check_both_empty(matching_files, matching_files_old):
    """Tell whether both arguments are empty (falsy).

    Args:
        matching_files: The new collection of matching files.
        matching_files_old: The old collection of matching files.

    Returns:
        True if both arguments are falsy, False otherwise.
    """
    return not (matching_files or matching_files_old)

def question_answer_all_query(query, text_alls=None):
    """Answer a query that may refer to several files in the shared folder.

    The query is split into one sub-question per mentioned file. The context
    for non-spreadsheet files is gathered by aws1() and answered in a single
    prompt; spreadsheet files (.xlsx/.csv) are answered from the raw sheet
    content in a second prompt.

    Args:
        query: The full user query.
        text_alls: Pre-split documents to search. When omitted they are loaded
            with extract_data(). (aws1() requires them; this function used to
            call it without, which raised TypeError.)

    Returns:
        The document answer followed by the spreadsheet answer (if any).
    """
    if text_alls is None:
        text_alls = extract_data()

    matching_files, matched_terms = find_matching_files_in_docs_12(query)
    keywords = list(matched_terms)
    contents = extract_content_between_keywords(query, keywords)
    # Rename the sub-question keys from query terms to real file names.
    new_dict = replace_keys_with_values(contents, merge_files(matching_files, keywords))

    # Move the spreadsheet entries out of new_dict; they are handled apart.
    removed_files = {
        filename: sub_query
        for filename, sub_query in new_dict.items()
        if filename.endswith(('.xlsx', '.csv'))
    }
    for filename in removed_files:
        new_dict.pop(filename)

    text_document, query_document = aws1(new_dict, text_alls)
    prompt_document = f"Dựa vào nội dung sau: {text_document}. Hãy trả lời câu hỏi sau đây: {query_document}. Bằng tiếng Việt"
    answer_document = question_answer(prompt_document)

    text_csv, query_csv = aws1_csv(removed_files)
    answer_csv = ""
    # Only ask about spreadsheets when some were loaded. The previous check
    # tested a variable that was always "", leaving prompt_csv undefined.
    if text_csv:
        prompt_csv = f"Dựa vào nội dung sau: {text_csv}. Hãy trả lời câu hỏi sau đây: {query_csv}.Bằng tiếng Việt"
        answer_csv = question_answer(prompt_csv)

    return answer_document + answer_csv

def extract_data():
    """Load every supported file in ``/code/temp`` and split it into chunks.

    The loader is chosen by file-name suffix; files with any other suffix are
    ignored. Only the folder itself is listed (no recursion).

    Returns:
        The list produced by CharacterTextSplitter.split_documents() over all
        loaded documents.
    """
    folder = "/code/temp"

    # (suffixes, loader factory) pairs, checked in order.
    loader_factories = (
        ((".pdf",), UnstructuredPDFLoader),
        ((".docx", ".doc"), Docx2txtLoader),
        ((".txt",), lambda path: TextLoader(path, encoding="utf8")),
        ((".pptx",), UnstructuredPowerPointLoader),
        ((".csv",), UnstructuredCSVLoader),
        ((".xlsx",), UnstructuredExcelLoader),
        ((".xml",), UnstructuredXMLLoader),
        ((".html",), UnstructuredHTMLLoader),
        # JSON is loaded as plain text, like extract_data2() does. The former
        # JSONLoader(path) call omitted the jq_schema argument that loader
        # requires, so any .json file made this function fail.
        ((".json",), lambda path: TextLoader(path, encoding="utf8")),
        ((".md",), UnstructuredMarkdownLoader),
    )

    documents = []
    for file in os.listdir(folder):
        for suffixes, make_loader in loader_factories:
            if file.endswith(suffixes):
                documents.extend(make_loader(folder + "/" + file).load())
                break

    # Split the loaded documents into overlapping chunks.
    text_splitter = CharacterTextSplitter(chunk_size=2200, chunk_overlap=1500)
    return text_splitter.split_documents(documents)


def extract_data2(id):
    """Load every supported file in ``/code/temp/<id>`` and split it into chunks.

    The loader is chosen by file-name suffix; files with any other suffix are
    ignored. Only the folder itself is listed (no recursion).

    Args:
        id: Name of the sub-folder of ``/code/temp`` to read.

    Returns:
        The list produced by CharacterTextSplitter.split_documents() over all
        loaded documents.
    """
    folder = f"/code/temp/{id}"

    # (suffixes, loader factory) pairs, checked in order.
    loader_factories = (
        ((".pdf",), UnstructuredPDFLoader),
        ((".docx", ".doc"), Docx2txtLoader),
        ((".txt",), lambda path: TextLoader(path, encoding="utf8")),
        ((".pptx",), UnstructuredPowerPointLoader),
        ((".csv",), UnstructuredCSVLoader),
        ((".xlsx",), UnstructuredExcelLoader),
        ((".xml",), UnstructuredXMLLoader),
        ((".html",), UnstructuredHTMLLoader),
        # JSON files are read as plain text (no explicit encoding).
        ((".json",), TextLoader),
        ((".md",), UnstructuredMarkdownLoader),
    )

    documents = []
    for file in os.listdir(folder):
        for suffixes, make_loader in loader_factories:
            if file.endswith(suffixes):
                documents.extend(make_loader(folder + "/" + file).load())
                break

    # Split the loaded documents into overlapping chunks.
    text_splitter = CharacterTextSplitter(chunk_size=2200, chunk_overlap=1500)
    return text_splitter.split_documents(documents)

def question_answer_all_query_v1_id(query,text_alls,id):
    """Answer a query that refers to specific uploaded files.

    The query is matched against the session's files, the per-file content is
    keyed by file name, and spreadsheet entries (``.xlsx`` / ``.csv``) are
    separated from the rest. Spreadsheet entries go through ``aws1_csv_id``
    and, when that yields text, through ``question_answer``; the remaining
    entries are answered by ``aws1_all_id``.

    Fix over the previous version: the spreadsheet branch was guarded by
    ``test_csv``, a variable hard-wired to ``""``, so it could never run. The
    guard now checks ``text_csv``, the value actually returned by
    ``aws1_csv_id``.

    Args:
        query: The user question.
        text_alls: Pre-split documents, forwarded unchanged to ``aws1_all_id``.
        id: Session identifier, forwarded to the helper functions.

    Returns:
        The document answer followed by the spreadsheet answer (the latter is
        ``""`` when ``aws1_csv_id`` returned no text).
    """
    matched_files, matched_keywords = find_matching_files_in_docs_12_id(query, id)
    keywords = list(matched_keywords)
    contents = extract_content_between_keywords(query, keywords)
    # Replacement dictionary: maps the keys of `contents` onto their new names.
    replacement = merge_files(matched_files, keywords)
    per_file = replace_keys_with_values(contents, replacement)

    # Move spreadsheet entries out of `per_file`; they are handled separately.
    tabular_names = [
        name for name in per_file.keys() if name.endswith(('.xlsx', '.csv'))
    ]
    tabular = {}
    for name in tabular_names:
        tabular[name] = per_file.pop(name)

    text_csv, query_csv = aws1_csv_id(tabular, id)
    answer_csv = ""
    if text_csv:
        prompt_csv = f"Dựa vào nội dung sau: {text_csv}. Hãy trả lời câu hỏi sau đây: {query_csv}.Bằng tiếng Việt"
        answer_csv = question_answer(prompt_csv)

    answer_document = aws1_all_id(per_file, text_alls, id)
    return answer_document + answer_csv

from typing import List, Optional

from langchain_core.pydantic_v1 import BaseModel, Field


# Structured-output schema used by query_analyzer(): it is the class passed to
# `llm.with_structured_output(...)`, so the chat model's reply is returned as
# a Search instance.
# NOTE(review): documented with `#` comments on purpose — a class docstring
# would presumably be picked up as the schema description sent to the model;
# confirm before converting these comments into a docstring.
class Search(BaseModel):

    # Required field (`...` means "no default"): one string per search query.
    # The `description` text is part of the schema and therefore runtime data.
    queries: List[str] = Field(
        ...,
        description="Truy vấn riêng biệt để tìm kiếm, giữ nguyên ý chính câu hỏi riêng biệt",
    )

# SECURITY NOTE(review): an OpenAI API key is hard-coded in the three
# statements below and is therefore exposed to anyone who can read this file.
# It should be revoked and supplied through the process environment instead.
# - The first statement is a bare string literal: an expression with no effect.
# - `openai_key` is not referenced anywhere in the visible part of this file.
# - The last statement overwrites any OPENAI_API_KEY already present in the
#   environment at import time.
"sk-proj-TbTRQKKJPb7PEoIBJ6e7T3BlbkFJbdz7Qxsv3JJ7BFcmN2jf"
openai_key = "sk" + "-proj-TbTRQKKJPb7PEoIBJ6e7T3BlbkFJbdz7Qxsv3JJ7BFcmN2jf"
os.environ["OPENAI_API_KEY"] = "sk-proj-TbTRQKKJPb7PEoIBJ6e7T3BlbkFJbdz7Qxsv3JJ7BFcmN2jf"
from langchain_core.output_parsers.openai_tools import PydanticToolsParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

def query_analyzer(query):
    """Ask the chat model to break a user question into search queries.

    Builds a prompt (system instructions plus the question), sends it to
    ``gpt-3.5-turbo-0125`` at temperature 0 and requests the reply in the
    shape of the ``Search`` schema.

    Changes over the previous version: the ``PydanticToolsParser`` instance
    that was created and never used is gone, and the chain is no longer bound
    to a local that shadowed this function's own name.

    Args:
        query: The user question; it fills the ``{question}`` placeholder.

    Returns:
        Whatever the structured-output chain returns for ``Search``.
    """
    system = """Bạn có khả năng đưa ra các truy vấn tìm kiếm chính xác để lấy thông tin giúp trả lời các yêu cầu của người dùng. Các truy vấn của bạn phải chính xác, không được bỏ ngắn rút gọn.

   Nếu bạn cần tra cứu hai hoặc nhiều thông tin riêng biệt, bạn có thể làm điều đó!. Trả lời câu hỏi bằng tiếng Việt(Vietnamese), không được dùng ngôn ngữ khác"""
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system),
            ("human", "{question}"),
        ]
    )
    llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.0)
    structured_llm = llm.with_structured_output(Search)
    # RunnablePassthrough forwards the raw input as the "question" variable.
    analysis_chain = {"question": RunnablePassthrough()} | prompt | structured_llm
    return analysis_chain.invoke(query)

# SECURITY NOTE(review): this Cohere key is hard-coded and therefore exposed to
# anyone who can read the file; it should be revoked and read from the process
# environment instead. It is kept so that existing deployments keep working.
_COHERE_API_KEY = "RWIrMiF184xBmOAAvodxCxExefwEAYSn7yEOmbk1"
_RERANK_MODEL = "rerank-english-v2.0"
# A best rerank score at or below this value sends the query to the
# spreadsheet fallback instead of the best-ranked document.
_RELEVANCE_THRESHOLD = 0.82


def _sub_queries_from_analysis(analysis):
    """Return the search queries contained in a ``query_analyzer`` result."""
    queries = getattr(analysis, "queries", None)
    if queries is None:
        # Unknown result shape: keep the historical behaviour of pulling the
        # single-quoted substrings out of the textual representation.
        return re.findall(r"'([^']*)'", str(analysis))
    return [q for q in queries if q]


def _rerank_retriever_for(chunks, top_n, **store_kwargs):
    """Index ``chunks`` in Chroma and wrap the retriever with Cohere rerank."""
    store = Chroma.from_documents(documents=chunks, embedding=embeddings, **store_kwargs)
    # Retrieve every chunk and let the reranker pick the best ``top_n``.
    base_retriever = store.as_retriever(search_kwargs={"k": len(chunks)})
    return ContextualCompressionRetriever(
        base_compressor=CohereRerank(top_n=top_n, model=_RERANK_MODEL),
        base_retriever=base_retriever,
    )


def _tabular_csv_loader(file_path):
    """Return a ``CSVLoader`` for a spreadsheet, converting ``.xlsx`` first."""
    if file_path.endswith('.xlsx'):
        csv_path = file_path.replace(".xlsx", ".csv")
        convert_xlsx_to_csv(file_path, csv_path)
        return CSVLoader(csv_path)
    return CSVLoader(file_path)


def _loader_for_source(file_path):
    """Return the loader matching ``file_path``'s suffix, or ``None``."""
    if file_path.endswith(".pdf"):
        return UnstructuredPDFLoader(file_path)
    if file_path.endswith(('.docx', '.doc')):
        return Docx2txtLoader(file_path)
    if file_path.endswith('.txt'):
        return TextLoader(file_path, encoding="utf8")
    if file_path.endswith('.pptx'):
        return UnstructuredPowerPointLoader(file_path)
    if file_path.endswith('.xml'):
        return UnstructuredXMLLoader(file_path)
    if file_path.endswith('.html'):
        return UnstructuredHTMLLoader(file_path)
    if file_path.endswith('.json'):
        # JSONLoader needs a jq_schema argument and cannot be built from the
        # path alone; read JSON as plain text, as extract_data2 does.
        return TextLoader(file_path)
    if file_path.endswith('.md'):
        return UnstructuredMarkdownLoader(file_path)
    if file_path.endswith(('.xlsx', '.csv')):
        return _tabular_csv_loader(file_path)
    return None


def _answer_from_named_files(query, matched_files, matched_keywords, text_all, id):
    """Answer a sub-query for which file matches were found."""
    keywords = list(matched_keywords)
    contents = extract_content_between_keywords(query, keywords)
    # Replacement dictionary: maps the keys of `contents` onto their new names.
    replacement = merge_files(matched_files, keywords)
    per_file = replace_keys_with_values(contents, replacement)

    # Move spreadsheet entries out of `per_file`; they are handled separately.
    tabular_names = [
        name for name in per_file.keys() if name.endswith(('.xlsx', '.csv'))
    ]
    tabular = {}
    for name in tabular_names:
        tabular[name] = per_file.pop(name)

    text_csv, query_csv = aws1_csv_id(tabular, id)
    answer_csv = ""
    # The previous code tested `test_csv`, a variable that was always "", so
    # this branch was unreachable; `text_csv` is the value to check.
    if text_csv:
        prompt_csv = f"Dựa vào nội dung sau: {text_csv}. Hãy trả lời câu hỏi sau đây: {query_csv}.Bằng tiếng Việt"
        answer_csv = question_answer(prompt_csv)

    answer_document = aws1_all_id(per_file, text_all, id)
    return answer_document + answer_csv


def _answer_from_tabular_files(query, id):
    """Answer a sub-query from the session's best-matching spreadsheet.

    Returns ``None`` when the session folder holds no CSV/XLSX content or the
    reranker returns nothing, so the caller can fall back to another source.
    """
    folder = f"/code/temp/{id}"
    documents = []
    for file in os.listdir(folder):
        if file.endswith('.csv'):
            documents.extend(UnstructuredCSVLoader(f"{folder}/" + file).load())
        elif file.endswith('.xlsx'):
            documents.extend(UnstructuredExcelLoader(f"{folder}/" + file).load())

    splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=2200, chunk_overlap=1500)
    chunks = splitter.split_documents(documents)
    if not chunks:
        return None

    # NOTE(review): this store is persisted to a fixed directory, presumably
    # shared by every session and every call — confirm that is intended.
    reranker = _rerank_retriever_for(chunks, top_n=3, persist_directory='./demoaaa')
    ranked = reranker.get_relevant_documents(f"{query}")
    if not ranked:
        return None

    file_path = ranked[0].metadata['source']
    print(file_path)
    rows = _tabular_csv_loader(file_path).load()
    text = " " + "".join(row.page_content + "\n" for row in rows)
    prompt_csv = f"Dựa vào nội dung sau: {text}. Hãy trả lời câu hỏi sau đây: {query}. Bằng tiếng Việt"
    return question_answer(prompt_csv)


def _answer_from_best_document(query, ranked_docs):
    """Answer a sub-query from the file behind the best-ranked chunk."""
    file_path = ranked_docs[0].metadata['source']
    # Default context: the chunks already reranked by the caller. Used as-is
    # when the file type is unknown or the file yields no chunks.
    context_docs = ranked_docs
    loader = _loader_for_source(file_path)
    if loader is not None:
        splitter = CharacterTextSplitter(chunk_size=3200, chunk_overlap=1500)
        chunks = splitter.split_documents(loader.load())
        if chunks:
            reranker = _rerank_retriever_for(chunks, top_n=5)
            context_docs = reranker.get_relevant_documents(f"{query}")

    text = "".join(doc.page_content for doc in context_docs)
    prompt = f"Dựa vào nội dung sau:{text}. Hãy trả lời câu hỏi sau đây: {query}. Mà không thay đổi, chỉnh sửa nội dung mà mình đã cung cấp"
    return question_answer(prompt)


def _answer_sub_queries(query_all, text_all, id):
    """Split ``query_all`` into sub-queries and concatenate their answers."""
    sub_queries = _sub_queries_from_analysis(query_analyzer(query_all))

    splitter = RecursiveCharacterTextSplitter(chunk_size=4500, chunk_overlap=2500)
    chunks = splitter.split_documents(text_all)
    reranker = _rerank_retriever_for(chunks, top_n=5)

    answers = []
    for query in sub_queries:
        matched_files, matched_keywords = find_matching_files_in_docs_12_id(query, id)
        if matched_files:
            answers.append(
                _answer_from_named_files(query, matched_files, matched_keywords, text_all, id)
            )
            continue

        ranked = reranker.get_relevant_documents(f"{query}")
        answer = None
        if not ranked or float(ranked[0].metadata['relevance_score']) <= _RELEVANCE_THRESHOLD:
            answer = _answer_from_tabular_files(query, id)
        if answer is None and ranked:
            # Either the match was strong, or no spreadsheet data exists.
            answer = _answer_from_best_document(query, ranked)
        if answer:
            answers.append(answer)
    return "".join(answers)


def handle_query_upgrade_keyword(query_all,text_all,id,chat_history):
    """Answer a possibly multi-part question using chat history and documents.

    Steps, in order:
      1. ``question_answer`` is asked to answer ``query_all`` from
         ``chat_history`` alone.
      2. ``query_analyzer`` splits ``query_all`` into sub-queries.
      3. Each sub-query is answered from the files it matches, or — when it
         matches none — from the best spreadsheet (best rerank score <= 0.82)
         or the best-ranked document (score above 0.82, or no spreadsheet
         data available).
      4. All partial answers are passed to ``question_answer`` once more to
         produce the final answer to ``query_all``.

    Fixes over the previous version:
      - the spreadsheet branch for matched files was unreachable
        (``test_csv`` instead of ``text_csv``);
      - JSON sources were opened with ``JSONLoader(path)``, which lacks the
        required ``jq_schema`` argument;
      - ``endswith('doc')`` missed the dot;
      - an unmatched suffix left ``loader`` unbound or stale;
      - sub-queries are read from the analysis result's ``queries`` attribute
        rather than parsed from its repr;
      - empty rerank results and sessions without spreadsheets no longer fail;
      - the unused ``Cohere`` LLM instances are no longer created.

    Side effects: sets ``COHERE_API_KEY`` in ``os.environ``; the spreadsheet
    path prints the chosen file path and may write a ``.csv`` next to an
    ``.xlsx`` file.

    Args:
        query_all: The full user question.
        text_all: Documents to index (split again before indexing).
        id: Session identifier; names the folder under ``/code/temp``.
        chat_history: Earlier conversation, interpolated into the first prompt.

    Returns:
        The result of the final ``question_answer`` call.
    """
    os.environ["COHERE_API_KEY"] = _COHERE_API_KEY
    prompt_history = f"Dựa vào nội dung lịch sử câu hỏi sau:{chat_history}. Hãy trả lời câu hỏi sau đây: {query_all}. Mà không thay đổi, chỉnh sửa nội dung mà mình đã cung cấp"
    answer_all = "Câu trả lời sau khi xem qua lịch sử chat:" + question_answer(prompt_history)
    answer_all += _answer_sub_queries(query_all, text_all, id)

    prompt1 = f"Dựa vào nội dung sau:{answer_all}. Hãy trả lời câu hỏi sau đây: {query_all}. Mà không thay đổi, chỉnh sửa nội dung mà mình đã cung cấp"
    return question_answer(prompt1)


def handle_query_upgrade_keyword_old(query_all,text_all,id):
    """Answer a possibly multi-part question from the documents only.

    Same pipeline as ``handle_query_upgrade_keyword`` without the
    chat-history step: the question is split into sub-queries, each one is
    answered from matched files, the best spreadsheet or the best-ranked
    document, and the partial answers are merged by a final
    ``question_answer`` call. The same fixes apply.

    Side effects: sets ``COHERE_API_KEY`` in ``os.environ``; the spreadsheet
    path prints the chosen file path and may write a ``.csv`` next to an
    ``.xlsx`` file.

    Args:
        query_all: The full user question.
        text_all: Documents to index (split again before indexing).
        id: Session identifier; names the folder under ``/code/temp``.

    Returns:
        The result of the final ``question_answer`` call.
    """
    os.environ["COHERE_API_KEY"] = _COHERE_API_KEY
    answer_all = _answer_sub_queries(query_all, text_all, id)

    prompt1 = f"Dựa vào nội dung sau:{answer_all}. Hãy trả lời câu hỏi sau đây: {query_all}. Mà không thay đổi, chỉnh sửa nội dung mà mình đã cung cấp"
    return question_answer(prompt1)