|
|
|
|
|
import os
import re
import json
import shutil
import tempfile

import numpy as np
from langdetect import detect_langs, DetectorFactory
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import FAISS
from sklearn.metrics.pairwise import cosine_similarity
from transformers import T5Tokenizer, T5ForConditionalGeneration

# langdetect is non-deterministic by default; fixing the seed makes detection reproducible.
DetectorFactory.seed = 0
|
|
|
# Files of the prebuilt FAISS index, plus the embedding model used to build it.
index_source = 'index.faiss'
hh_source = 'index.pkl'
model_name = "sentence-transformers/all-MiniLM-L6-v2"

embedding_llm = SentenceTransformerEmbeddings(model_name=model_name)
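
# Sanity check (a sketch): all-MiniLM-L6-v2 produces 384-dimensional vectors,
# which must match the dimension the stored FAISS index was built with.
# vec = embedding_llm.embed_query("test")
# assert len(vec) == 384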
|
|
|
# FLAN-T5 is used to translate French questions into English before retrieval.
tokenizer1 = T5Tokenizer.from_pretrained("google/flan-t5-base")
model1 = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
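
# Illustrative translation call (the input text here is made up):
# ids = tokenizer1("translate French to English: Bonjour, comment allez-vous ?", return_tensors="pt").input_ids
# print(tokenizer1.decode(model1.generate(ids, max_length=100)[0], skip_special_tokens=True))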
|
# FAISS.load_local expects a folder containing both index.faiss and index.pkl,
# so copy the two files into a temporary directory and load the store from there.
with tempfile.TemporaryDirectory() as temp_dir:
    index_target = os.path.join(temp_dir, 'index.faiss')
    hh_target = os.path.join(temp_dir, 'index.pkl')

    shutil.copy(index_source, index_target)
    shutil.copy(hh_source, hh_target)

    vector_db = FAISS.load_local(temp_dir, embedding_llm, allow_dangerous_deserialization=True)
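
# Quick retrieval sanity check (a sketch; the query string is illustrative):
# hits = vector_db.similarity_search("Djezzy offers", k=1)
# print(hits[0].page_content)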
|
|
|
|
|
|
|
def load_data(text_filename='docs_text.json', embeddings_filename='docs_embeddings.json'):
    """Load the document texts and their precomputed embeddings from JSON files."""
    with open(text_filename, 'r', encoding='utf-8') as f:
        docs_text = json.load(f)

    with open(embeddings_filename, 'r') as f:
        docs_embeddings = json.load(f)

    return docs_text, docs_embeddings
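
# Expected shapes (inferred from pip() below, not from the JSON files themselves):
# docs_text should be a list of strings and docs_embeddings a parallel list of
# float vectors, since pip() zips them together and reshapes each embedding.
# docs_text, docs_embeddings = load_data()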
|
|
|
def mot_cle(path):
    """Read a comma-separated keyword file, print each keyword, and return them as a list."""
    with open(path, 'r') as fichier:
        contenu = fichier.read()

    mots = contenu.split(',')
    for mot in mots:
        print(mot.strip())

    tableau_de_mots = [mot.strip() for mot in mots]
    return tableau_de_mots
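
# Example (hypothetical file name and contents): a file 'keywords.txt' containing
# "Hayla, Legend" would print each keyword and return ['Hayla', 'Legend'].
# mots_a_verifier = mot_cle('keywords.txt')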
|
|
|
|
|
|
|
def pip(question, docs_text, docs_embeddings, mots_a_verifier, vector_db):
    """Route a question through keyword matching or embedding similarity and build the LLM prompt."""
    query_text = question
    q1 = question  # keep the original wording for the French prompt variants
    print(q1)

    # Detect the dominant language of the question.
    detected_languages = detect_langs(question)
    main_language = max(detected_languages, key=lambda lang: lang.prob)
    lang = main_language.lang
    print(lang)

    # French questions are translated to English so retrieval runs against English text.
    if lang == 'fr':
        input_text = f"translate French to English: {query_text}"
        input_ids = tokenizer1(input_text, return_tensors="pt").input_ids
        outputs = model1.generate(input_ids, max_length=100)
        # skip_special_tokens drops the <pad>/</s> markers from the decoded output
        cleaned_text = tokenizer1.decode(outputs[0], skip_special_tokens=True).strip()
        print(cleaned_text)
        query_text = cleaned_text

    query_embedding = embedding_llm.embed_query(query_text)
    query_embedding_array = np.array(query_embedding)
    docs_embeddings = np.array(docs_embeddings)

    question = query_text
    print(question)

    # Keyword matching: try bi-grams first, then fall back to single words.
    mots_question = question.lower().split()
    bi_grammes = [' '.join([mots_question[i], mots_question[i + 1]]) for i in range(len(mots_question) - 1)]

    mots_a_verifier_lower = {mot.lower(): mot for mot in mots_a_verifier}
    mots_question_lower = [mot.lower() for mot in mots_question]
    bi_grammes_lower = [mot.lower() for mot in bi_grammes]

    mots_trouves1 = [mots_a_verifier_lower[mot] for mot in mots_a_verifier_lower if mot in bi_grammes_lower]
    if not mots_trouves1:
        mots_trouves1 = [mots_a_verifier_lower[mot] for mot in mots_a_verifier_lower if mot in mots_question_lower]
|
|
|
|
|
    mots_trouves = mots_trouves1
    if not mots_trouves:
        # No keyword matched: rank every document by cosine similarity to the query
        # and pick a prompt strategy based on how strong the best matches are.
        similarities = [cosine_similarity(doc.reshape(1, -1), query_embedding_array.reshape(1, -1))[0][0] for doc in docs_embeddings]
        print(similarities)
        sorted_docs = sorted(zip(docs_text, docs_embeddings, similarities), key=lambda x: x[2], reverse=True)
        similar_docs1 = [(doc, sim) for doc, _, sim in sorted_docs if sim > 0.72]
        if not similar_docs1:
            similar_docs2 = [(doc, sim) for doc, _, sim in sorted_docs if sim > 0.65]
            if not similar_docs2:
                similar_docs = [(doc, sim) for doc, _, sim in sorted_docs if sim > 0.4]
                if not similar_docs:
                    similar_docsA = [(doc, sim) for doc, _, sim in sorted_docs if 0.3 <= sim < 0.4]
                    if not similar_docsA:
                        # Nothing even borderline: the question is out of scope.
                        print("As a chatbot for Djezzy, I can provide information exclusively about our affiliated companies. Unfortunately, I'm unable to respond to inquiries outside of that scope.")
                        prompt = "For this question write this answer and don't add anything: As a chatbot for Djezzy, I can provide information exclusively about our affiliated companies. Unfortunately, I'm unable to respond to inquiries outside of that scope."
                        if lang == 'fr':
                            prompt = "For this question translate this answer into French and write it, don't add anything and don't mention that you translated the answer: As a chatbot for Djezzy, I can provide information exclusively about our affiliated companies. Unfortunately, I'm unable to respond to inquiries outside of that scope."
                    else:
                        # Borderline matches (0.3 <= sim < 0.4): ask the user for more details.
                        print("I apologize, I don't fully understand your question. You can contact our customer service for answers to your needs, or if you can provide more details, I would be happy to help.")
                        prompt = "For this question write this answer and don't add anything: I apologize, I don't fully understand your question. You can contact our customer service for answers to your needs, or if you can provide more details, I would be happy to help."
                        if lang == 'fr':
                            prompt = "For this question translate this answer into French and write it, don't add anything and don't mention that you translated the answer: I apologize, I don't fully understand your question. You can contact our customer service for answers to your needs, or if you can provide more details, I would be happy to help."
                else:
                    # Weak matches (sim > 0.4): up to four context paragraphs.
                    context = "\n---------------------\n".join([doc for doc, _ in similar_docs[:4]] if len(similar_docs) >= 3 else [doc for doc, _ in similar_docs[:1]])
                    print(context)
                    prompt = f"As Djezzy's chatbot\nread each paragraph in the context and answer the question.\ndo not take into consideration the paragraphs which have no relation to the question\nif there is no paragraph related to the question, respond that for this question it's best to reach out to our customer service team. They'll be able to assist you with your needs\njust give me the answer, I don't want any other details\ndon't mention that you used the provided context\n###context:{context}\n###question: {query_text}"
                    if lang == 'fr':
                        prompt = f"[INST] <<SYS>>\nAs Djezzy's chatbot\nread each paragraph in the context and answer the question.\ndo not take into consideration the paragraphs which have no relation to the question\nif there is no paragraph related to the question, respond that for this question it's best to reach out to our customer service team. They'll be able to assist you with your needs\njust give me the answer, I don't want any other details\ndon't mention that you used the provided context\ntranslate the answer into French and write it, don't mention that you translated the answer\n###context:{context}<</SYS>>\n\n###question: {query_text} [/INST]"
            else:
                # Medium matches (sim > 0.65): up to two context paragraphs.
                context = "\n---------------------\n".join([doc for doc, _ in similar_docs2[:2]] if len(similar_docs2) >= 2 else [doc for doc, _ in similar_docs2[:1]])
                print(context)
                prompt = f"As Djezzy's chatbot\nread each paragraph in the context and answer the question.\ndo not take into consideration the paragraphs which have no relation to the question\nif there is no paragraph related to the question, respond that for this question it's best to reach out to our customer service team. They'll be able to assist you with your needs\njust give me the answer, I don't want any other details\n###context:{context}\n###question: {query_text}"
                if lang == 'fr':
                    prompt = f"As Djezzy's chatbot\nread each paragraph in the context and answer the question.\ndo not take into consideration the paragraphs which have no relation to the question\nif there is no paragraph related to the question, respond that for this question it's best to reach out to our customer service team. They'll be able to assist you with your needs\njust give me the answer, I don't want any other details\ndon't mention that you used the provided context\ntranslate the answer into French and write it, don't mention that you translated the answer\n###context:{context}\n###question: {query_text}"
        else:
            # Strong match (sim > 0.72): only the single best paragraph.
            context = "\n---------------------\n".join([doc for doc, _ in similar_docs1[:1]])
            print(context)
            prompt = f"As Djezzy's chatbot\nread each paragraph in the context carefully and answer the question.\ndo not take into consideration the paragraphs which have no relation to the question\nif there is no paragraph related to the question, respond that for this question it's best to reach out to our customer service team. They'll be able to assist you with your needs\njust give me the answer, I don't want any other details\ndifferentiate between the prices and give the correct answer; do not mix up the offers attached to each price\n###context:{context}\n###question: {query_text}"
            if lang == 'fr':
                prompt = f"As Djezzy's chatbot\nread each paragraph in the context and answer the question.\ndo not take into consideration the paragraphs which have no relation to the question\nif there is no paragraph related to the question, respond that for this question it's best to reach out to our customer service team. They'll be able to assist you with your needs\njust give me the answer, I don't want any other details\ndon't mention that you used the provided context\ntranslate the answer into French, don't mention that you translated the answer\n###context:{context}\n###question: {q1}"
    else:
        # A keyword matched: fetch the best chunk per keyword from the FAISS store,
        # filtering on the 'document' metadata field.
        similar_docs = []
        for mot in mots_trouves:
            result = vector_db.similarity_search(
                query_text,
                k=1,
                filter={'document': mot}
            )
            if result:  # skip keywords with no indexed chunk
                similar_docs.append(result[0])
        context = "\n---------------------\n".join(doc.page_content for doc in similar_docs)
        print(context)
        prompt = f"As Djezzy's chatbot\nread each paragraph in the context and answer the question.\ndo not take into consideration the paragraphs which have no relation to the question\nif there is no paragraph related to the question, respond that for this question it's best to reach out to our customer service team. They'll be able to assist you with your needs\njust give me the answer, I don't want any other details\n###context:{context}\n###question: {query_text}"
        if lang == 'fr':
            prompt = f"As Djezzy's chatbot\nread each paragraph in the context and answer the question.\ndo not take into consideration the paragraphs which have no relation to the question\nif there is no paragraph related to the question, respond that for this question it's best to reach out to our customer service team. They'll be able to assist you with your needs\njust give me the answer, I don't want any other details\ndon't mention that you used the provided context\ngive me the answer in the French language\n###context:{context}\n###question: {q1}"

    return prompt
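
# End-to-end sketch (file names and the question are illustrative):
# docs_text, docs_embeddings = load_data()
# mots_a_verifier = mot_cle('keywords.txt')
# prompt = pip("Quelles sont les offres Djezzy ?", docs_text, docs_embeddings, mots_a_verifier, vector_db)
# The returned prompt is then passed to the downstream chat model (not shown in this section).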
|
|