import os
import re
import uuid
import logging
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from typing import List

import bs4
import chromadb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
import ollama
import gradio as gr
from dotenv import load_dotenv

from langchain import hub
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader, UnstructuredURLLoader
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_unstructured import UnstructuredLoader
load_dotenv()
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
# LANGCHAIN_API_KEY and OPENAI_API_KEY are expected to be provided via the .env file;
# looking them up here fails fast if they are missing.
os.environ['LANGCHAIN_API_KEY']
os.environ["OPENAI_API_KEY"]

def clean_text(text):
    '''Clean the raw text returned by the web loader.'''
    text = text.replace('\xa0', ' ')        # non-breaking spaces
    text = re.sub(r'[\n\r\t]+', ' ', text)  # newlines / tabs -> spaces
    text = re.sub(r'\s+', ' ', text)        # collapse repeated whitespace
    return text.strip()
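# Quick illustrative check (not part of the original pipeline): clean_text should
# collapse the non-breaking spaces and newline runs that the web loader leaves behind.
assert clean_text("Inscription\xa0:\n\n  en ligne") == "Inscription : en ligne"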
# Persistent Chroma store: the collection is created on the first run and reused afterwards.
chroma_db_path = "./chroma_db"
chroma_client = chromadb.PersistentClient(path=chroma_db_path)
data = chroma_client.get_or_create_collection(name="my_dataaaa")

# Multilingual embedding model (needed by every ingestion step below).
# embeddings_model = SentenceTransformer("HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5")
embeddings_model = HuggingFaceEmbeddings(model_name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5")
# ------------------------------------------------------------------
# Local PDF documents (charter, circulars, internship forms, mastère
# descriptions, mental-health flyers, ...). Each file is loaded page by
# page, split into chunks, embedded and stored in the Chroma collection.
# ------------------------------------------------------------------
pdf_files = [
    "Charte.pdf",
    "circulaire 35-2010.pdf",
    "Demande de prolongation de stage MP2 Physique.pdf",
    "dérogation pdf.pdf",
    "Fiche d'évaluation de stage.pdf",
    "النظام الداخلي لكلية العلوم بالمنستير.pdf",
    "sante_mentale.pdf",
    "sante_mentale2.pdf",
    "score_pour_mastere.pdf",
    # Masters recherche
    "recherche_chimie.pdf",
    "recherche_info.pdf",
    "recherche_phy.pdf",
    # Masters pro
    "pro_chimie.pdf",
    "pro_info.pdf",
    # Can two internships be carried out at the same time?
    "deux_stage_.pdf",
    # Questions and answers about the student card (indexed with smaller chunks)
    "Les avantages de la carte étudiante.pdf",
]

for file_path in pdf_files:
    loader = PyPDFLoader(file_path)
    pages = loader.load()

    if file_path == "Charte.pdf":
        # only the first page of the charter was indexed in the original run
        text = pages[0].page_content
    else:
        text = "\n".join(page.page_content for page in pages)

    if file_path == "Les avantages de la carte étudiante.pdf":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=300, chunk_overlap=50,
            separators=["\n\n", "\n", ".", " ", "\n•"])
    else:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=700, chunk_overlap=100,
            separators=["\n\n", "\n", ".", " "])

    splits = text_splitter.split_text(text)
    embeddings = embeddings_model.embed_documents(splits)
    ids = [str(uuid.uuid4()) for _ in range(len(splits))]
    data.add(documents=splits, embeddings=embeddings, ids=ids)
# # Checking that the data was actually added ✅
print(data.get(include=['embeddings']))
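# Optional retrieval sanity check (illustrative sketch; the question is a made-up example,
# not from the original notebook): embed a query and look at the closest stored chunks.
sample_question = "Quelle est la procédure de dépôt d'un mastère ?"
sample_embedding = embeddings_model.embed_query(sample_question)
hits = data.query(query_embeddings=[sample_embedding], n_results=3,
                  include=["documents", "distances"])
print(hits["documents"][0])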
# # Configure `ChromaDB` for our work
# The persistent client and the embedding model are initialised at the top of the
# script; get_or_create_collection simply returns the existing collection here.
# chroma_client.delete_collection(name="my_dataaaa")  # uncomment to rebuild the collection from scratch
data = chroma_client.get_or_create_collection(name="my_dataaaa")
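# Sketch (an assumption about later use, not code from the original notebook): once the
# collection is populated, the same persistent client can be wrapped in LangChain's
# Chroma vectorstore so the stored chunks become available to a retriever without
# re-embedding anything.
vectorstore = Chroma(
    client=chroma_client,
    collection_name="my_dataaaa",
    embedding_function=embeddings_model,
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})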
# # <p style="color: orange;">Document 0 Masteres-Procedure-de-Depot</p>
loader = WebBaseLoader(
    web_paths=("https://fsm.rnu.tn/fra/pages/152/Masteres-Procedure-de-Depot",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("content")
        )
    ),
)
Masteres_Procedure_de_Depot = loader.load()

Masteres_Procedure_de_Depot = [
    Document(page_content=clean_text(doc.page_content), metadata=doc.metadata)
    for doc in Masteres_Procedure_de_Depot]
Masteres_Procedure_de_Depot

# ## splitting doc 0 into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100)
splits1 = text_splitter.split_documents(Masteres_Procedure_de_Depot)
splits1

# ## Saving to chromadb in data
contents1 = [doc.page_content for doc in splits1]
metadata1 = [doc.metadata for doc in splits1]

embeddings1 = embeddings_model.embed_documents(contents1)
print(embeddings1)

ids = [str(uuid.uuid4()) for _ in range(len(contents1))]

data.add(
    documents=contents1,
    embeddings=embeddings1,
    metadatas=metadata1,
    ids=ids
)

# visualising what has been stored, in a dataframe
data_dict = {
    "ID": ids,
    "Document": contents1,
    "Metadata": metadata1,
    "Embedding Shape": [np.array(embed).shape for embed in embeddings1],
}
df = pd.DataFrame(data_dict)
df.tail()

def append_data(contents, metadata, embeddings):
    '''Append the documents, their metadata and the embedding shapes to data_dict
    so we can visualise what has been stored in Chroma.'''
    global df
    new_ids = list(range(len(df) + 1, len(df) + 1 + len(contents)))
    data_dict["ID"].extend(new_ids)
    data_dict["Document"].extend(contents)
    data_dict["Metadata"].extend(metadata)
    data_dict["Embedding Shape"].extend([np.array(embed).shape for embed in embeddings])
    df = pd.DataFrame(data_dict)
# # <p style="color: orange;">Document 1 Theses-Inscriptions-etProcedure-de-Depot</p> | |
# In[46]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/pages/147/Theses-Inscriptions-etProcedure-de-Depot",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
Theses_Inscriptions_etProcedure_de_Depot = loader.load() | |
# In[47]: | |
Theses_Inscriptions_etProcedure_de_Depot = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in Theses_Inscriptions_etProcedure_de_Depot] | |
Theses_Inscriptions_etProcedure_de_Depot | |
# ## splitting into chunks the doc1 | |
# In[49]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits2 = text_splitter.split_documents( Theses_Inscriptions_etProcedure_de_Depot) | |
# In[50]: | |
splits2 | |
# In[51]: | |
contents2= [doc.page_content for doc in splits2] | |
metadata2 = [doc.metadata for doc in splits2] | |
# In[52]: | |
embeddings2 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits2], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings2) | |
# In[53]: | |
ids2= [str(uuid.uuid4()) for _ in range(len(contents2))] | |
# In[54]: | |
data.add( | |
documents=contents2, | |
embeddings=embeddings2, | |
metadatas=metadata2, | |
ids=ids2 | |
) | |
# In[55]: | |
append_data(contents2, metadata2, embeddings2) | |
# In[56]: | |
df | |
# # <p style="color: orange;"> Document 2 رشة_بعنوان_أهمية_الصحة_النفسية</p> | |
# In[58]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/articles/4798/%D9%88%D8%B1%D8%B4%D8%A9-%D8%A8%D8%B9%D9%86%D9%88%D8%A7%D9%86-%D8%A3%D9%87%D9%85%D9%8A%D8%A9-%D8%A7%D9%84%D8%B5%D8%AD%D8%A9-%D8%A7%D9%84%D9%86%D9%81%D8%B3%D9%8A%D8%A9",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
warcha_mental_health = loader.load() | |
# In[59]: | |
warcha_mental_health = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in warcha_mental_health] | |
warcha_mental_health | |
# ## spitting doc 2 into chunks | |
# In[61]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits3 = text_splitter.split_documents( warcha_mental_health) | |
# In[62]: | |
splits3 | |
# In[63]: | |
contents3= [doc.page_content for doc in splits3] | |
metadata3 = [doc.metadata for doc in splits3] | |
# In[64]: | |
embeddings3 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits3], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings3) | |
# In[65]: | |
ids3 = [str(uuid.uuid4()) for _ in range(len(contents3))] | |
# In[66]: | |
data.add( | |
documents=contents3, | |
embeddings=embeddings3, | |
metadatas=metadata3, | |
ids=ids3 | |
) | |
# In[67]: | |
append_data(contents3, metadata3, embeddings3) | |
# In[68]: | |
df.tail() | |
# # <p style="color: orange;"> Document 3 festival-de-la-creativite-estudiantine</p> | |
# In[70]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/articles/4795/festival-de-la-creativite-estudiantine",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
festival_de_la_creativite_estudiantinet = loader.load() | |
# In[71]: | |
festival_de_la_creativite_estudiantinet = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in festival_de_la_creativite_estudiantinet] | |
festival_de_la_creativite_estudiantinet | |
# ## splitting the Doc3 into chunks | |
# In[73]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits4 = text_splitter.split_documents( festival_de_la_creativite_estudiantinet) | |
# In[74]: | |
print(splits4[0].page_content) # First chunk's content | |
print(splits4[0].metadata) | |
# In[75]: | |
contents4= [doc.page_content for doc in splits4] | |
metadata4 = [doc.metadata for doc in splits4] | |
# In[76]: | |
embeddings4 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits4], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings4) | |
# In[77]: | |
ids4 = [str(uuid.uuid4()) for _ in range(len(contents4))] | |
# In[78]: | |
data.add( | |
documents=contents4, | |
embeddings=embeddings4, | |
metadatas=metadata4, | |
ids=ids4 | |
) | |
# In[79]: | |
append_data(contents4, metadata4, embeddings4) | |
# In[80]: | |
df | |
# # <p style="color: orange;"> Document 4 bourses-d-alternance-2025</p> | |
# In[82]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/articles/4813/bourses-d-alternance-2025",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
Bourse_alternance = loader.load() | |
# In[83]: | |
Bourse_alternance = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in Bourse_alternance] | |
Bourse_alternance | |
# ## splitting doc 4 into chunks | |
# In[85]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits5 = text_splitter.split_documents( Bourse_alternance) | |
# In[86]: | |
print(splits5[2].page_content) | |
print(splits5[2].metadata) | |
# In[87]: | |
contents5= [doc.page_content for doc in splits5] | |
metadata5 = [doc.metadata for doc in splits5] | |
# In[88]: | |
embeddings5 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits5], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings5) | |
# In[89]: | |
ids5 = [str(uuid.uuid4()) for _ in range(len(contents5))] | |
# In[90]: | |
data.add( | |
documents=contents5, | |
embeddings=embeddings5, | |
metadatas=metadata5, | |
ids=ids5 | |
) | |
# In[91]: | |
append_data(contents5, metadata5, embeddings5) | |
# In[92]: | |
df | |
# # <p style="color: orange;"> Document 5 the-indian-council-for-cultural-relations--iccr</p> | |
# In[94]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/articles/4807/the-indian-council-for-cultural-relations--iccr-",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
the_indian_council_for_cultural_relations = loader.load() | |
# In[95]: | |
the_indian_council_for_cultural_relations = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in the_indian_council_for_cultural_relations] | |
the_indian_council_for_cultural_relations | |
# ## splitting doc 5 into chunks | |
# In[97]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits6 = text_splitter.split_documents( the_indian_council_for_cultural_relations) | |
# In[98]: | |
splits6 | |
# In[99]: | |
contents6= [doc.page_content for doc in splits6] | |
metadata6 = [doc.metadata for doc in splits6] | |
# In[100]: | |
embeddings6 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits6], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings6) | |
# In[101]: | |
ids6 = [str(uuid.uuid4()) for _ in range(len(contents6))] | |
# In[102]: | |
data.add( | |
documents=contents6, | |
embeddings=embeddings6, | |
metadatas=metadata6, | |
ids=ids6 | |
) | |
# In[103]: | |
append_data(contents6, metadata6, embeddings6) | |
# In[104]: | |
df | |
# In[105]: | |
# page_url = "https://fsm.rnu.tn/useruploads/files/au2425/NV%20ICCR.pdf" | |
# loader = PyPDFLoader(page_url) | |
# applications_guidelines_indian = [] | |
# async for doc in loader.alazy_load(): | |
# applications_guidelines_indian.append(doc) | |
# In[106]: | |
# applications_guidelines_indian | |
# In[107]: | |
# documents6 | |
# In[108]: | |
# pip install "unstructured[pdf]" | |
# # <p style="color: orange;"> Document 6 Règlement intérieur des examens</p> | |
# In[110]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/pages/346/R%C3%A8glement-int%C3%A9rieur-des-examens",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
Règlement_intérieur_des_examens = loader.load() | |
# In[111]: | |
Règlement_intérieur_des_examens = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in Règlement_intérieur_des_examens] | |
Règlement_intérieur_des_examens | |
# ## splitting doc 6 into chunks | |
# In[113]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits7 = text_splitter.split_documents( Règlement_intérieur_des_examens) | |
# In[114]: | |
splits7 | |
# In[115]: | |
contents7= [doc.page_content for doc in splits7] | |
metadata7 = [doc.metadata for doc in splits7] | |
# In[116]: | |
embeddings7 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits7], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings7) | |
# In[117]: | |
ids7 = [str(uuid.uuid4()) for _ in range(len(contents7))] | |
# In[118]: | |
data.add( | |
documents=contents7, | |
embeddings=embeddings7, | |
metadatas=metadata7, | |
ids=ids7 | |
) | |
# In[119]: | |
append_data(contents7, metadata7, embeddings7) | |
# In[120]: | |
df | |
# # <p style="color: orange;">Document 7 Gestion des Stages & PFE (CPE-BR-01-00)</p> | |
# In[122]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/pages/73/Stages-&-PFE",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
Stages_PFE = loader.load() | |
# In[123]: | |
Stages_PFE = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in Stages_PFE] | |
Stages_PFE | |
# ## splitting doc 7 into chunks | |
# In[125]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits8 = text_splitter.split_documents( Stages_PFE) | |
# In[126]: | |
splits8 | |
# In[127]: | |
contents8= [doc.page_content for doc in splits8] | |
metadata8 = [doc.metadata for doc in splits8] | |
# In[128]: | |
embeddings8= embeddings_model.embed_documents( | |
[doc.page_content for doc in splits8], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings8) | |
# In[129]: | |
ids8 = [str(uuid.uuid4()) for _ in range(len(contents8))] | |
# In[130]: | |
data.add( | |
documents=contents8, | |
embeddings=embeddings8, | |
metadatas=metadata8, | |
ids=ids8 | |
) | |
# In[131]: | |
append_data(contents8, metadata8, embeddings8) | |
# In[132]: | |
df | |
# # <p style="color: orange;">Document 8 Procédure de déroulement des stages facultatifs (CPE-IN-01-00)</p> | |
# In[134]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/pages/437/Proc%C3%A9dure-de-d%C3%A9roulement-des-stages-facultatif",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
Procédure_de_déroulement_des_stages_facultatifs = loader.load() | |
# In[135]: | |
Procédure_de_déroulement_des_stages_facultatifs = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in Procédure_de_déroulement_des_stages_facultatifs] | |
Procédure_de_déroulement_des_stages_facultatifs | |
# ## splitting doc 8 into chunks | |
# In[137]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits9 = text_splitter.split_documents( Procédure_de_déroulement_des_stages_facultatifs) | |
# In[138]: | |
splits9 | |
# In[139]: | |
contents9= [doc.page_content for doc in splits9] | |
metadata9 = [doc.metadata for doc in splits9] | |
# In[140]: | |
embeddings9 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits9], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings9) | |
# In[141]: | |
ids9 = [str(uuid.uuid4()) for _ in range(len(contents9))] | |
# In[142]: | |
data.add( | |
documents=contents9, | |
embeddings=embeddings9, | |
metadatas=metadata9, | |
ids=ids9 | |
) | |
# In[143]: | |
append_data(contents9, metadata9, embeddings9) | |
# In[144]: | |
df | |
# # <p style="color: orange;"> Document 9 Procédure de déroulement des stages obligatoires (CPE-IN-02-00)</p> | |
# In[146]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/pages/75/Proc%C3%A9dure-de-d%C3%A9roulement-des-stages",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
Procédure_de_déroulement_des_stages_obligatoires = loader.load() | |
# In[147]: | |
Procédure_de_déroulement_des_stages_obligatoires = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in Procédure_de_déroulement_des_stages_obligatoires] | |
Procédure_de_déroulement_des_stages_obligatoires | |
# ## splitting doc 9 into chunks | |
# In[149]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits10= text_splitter.split_documents(Procédure_de_déroulement_des_stages_obligatoires) | |
# In[150]: | |
splits10 | |
# In[151]: | |
contents10= [doc.page_content for doc in splits10] | |
metadata10 = [doc.metadata for doc in splits10] | |
# In[152]: | |
embeddings10 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits10], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings10) | |
# In[153]: | |
ids10 = [str(uuid.uuid4()) for _ in range(len(contents10))] | |
# In[154]: | |
data.add( | |
documents=contents10, | |
embeddings=embeddings10, | |
metadatas=metadata10, | |
ids=ids10 | |
) | |
# In[155]: | |
append_data(contents10, metadata10, embeddings10) | |
# In[156]: | |
df | |
# # <p style="color: orange;"> Document 10 Partenariat international</p> | |
# In[158]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/pages/9/Partenariat-international",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
Partenariat_international = loader.load() | |
# In[159]: | |
Partenariat_international = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in Partenariat_international] | |
Partenariat_international | |
# ## splitting doc 10 into chunks | |
# In[161]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits11 = text_splitter.split_documents(Partenariat_international) | |
# In[162]: | |
splits11 | |
# In[163]: | |
contents11= [doc.page_content for doc in splits11] | |
metadata11 = [doc.metadata for doc in splits11] | |
# In[164]: | |
embeddings11 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits11], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings11) | |
# In[165]: | |
ids11 = [str(uuid.uuid4()) for _ in range(len(contents11))] | |
# In[166]: | |
data.add( | |
documents=contents11, | |
embeddings=embeddings11, | |
metadatas=metadata11, | |
ids=ids11 | |
) | |
# In[167]: | |
append_data(contents11, metadata11, embeddings11) | |
# In[168]: | |
df | |
# # <p style="color: orange;"> Document 11 Communication</p> | |
# In[170]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/pages/140/Communication",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
Communication = loader.load() | |
# In[171]: | |
Communication = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in Communication] | |
Communication | |
# ## splitting doc 11 into chunks | |
# In[173]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits12 = text_splitter.split_documents(Communication) | |
# In[174]: | |
splits12 | |
# In[175]: | |
contents12= [doc.page_content for doc in splits12] | |
metadata12 = [doc.metadata for doc in splits12] | |
# In[176]: | |
embeddings12 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits12], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings12) | |
# In[177]: | |
ids12 = [str(uuid.uuid4()) for _ in range(len(contents12))] | |
# In[178]: | |
data.add( | |
documents=contents12, | |
embeddings=embeddings12, | |
metadatas=metadata12, | |
ids=ids12 | |
) | |
# In[179]: | |
append_data(contents12, metadata12, embeddings12) | |
# In[180]: | |
df | |
# # <p style="color: orange;"> Document 12 Liens utiles</p> | |
# In[182]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/links",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("links_container","link_item","link_tags") | |
) | |
), | |
) | |
Liens_utiles = loader.load() | |
# In[183]: | |
Liens_utiles = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in Liens_utiles] | |
Liens_utiles | |
# ## splitting doc 12 into chunks | |
# In[185]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits13 = text_splitter.split_documents(Liens_utiles) | |
# In[186]: | |
splits13 | |
# In[187]: | |
contents13= [doc.page_content for doc in splits13] | |
metadata13 = [doc.metadata for doc in splits13] | |
# In[188]: | |
embeddings13 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits13], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings13) | |
# In[189]: | |
ids13 = [str(uuid.uuid4()) for _ in range(len(contents13))] | |
# In[190]: | |
data.add( | |
documents=contents13, | |
embeddings=embeddings13, | |
metadatas=metadata13, | |
ids=ids13 | |
) | |
# In[191]: | |
append_data(contents13, metadata13, embeddings13) | |
# In[192]: | |
df | |
# # <p style="color: orange;"> Document 13 Departement Chimie </p> | |
# In[194]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/departements/CH/4/chimie",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
Chimie = loader.load() | |
# In[195]: | |
Chimie = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in Chimie] | |
Chimie | |
# ## splitting doc 13 into chunks | |
# In[197]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits14 = text_splitter.split_documents(Chimie) | |
# In[198]: | |
splits14 | |
# In[199]: | |
contents14= [doc.page_content for doc in splits14] | |
metadata14 = [doc.metadata for doc in splits14] | |
# In[200]: | |
embeddings14 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits14], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings14) | |
# In[201]: | |
ids14 = [str(uuid.uuid4()) for _ in range(len(contents14))] | |
# In[202]: | |
data.add( | |
documents=contents14, | |
embeddings=embeddings14, | |
metadatas=metadata14, | |
ids=ids14 | |
) | |
# In[203]: | |
append_data(contents14, metadata14, embeddings14) | |
# In[204]: | |
df | |
# # <p style="color: orange;"> Document 14 Departement Mathematique </p> | |
# In[206]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/departements/M/1/mathematiques",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("selectEnsFilter") | |
) | |
), | |
) | |
math = loader.load() | |
# In[207]: | |
math = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in math] | |
math | |
# ## splitting doc 14 into chunks | |
# In[209]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits15 = text_splitter.split_documents(math) | |
# In[210]: | |
splits15 | |
# In[211]: | |
contents15= [doc.page_content for doc in splits15] | |
metadata15 = [doc.metadata for doc in splits15] | |
# In[212]: | |
embeddings15 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits15], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings15) | |
# In[213]: | |
ids15 = [str(uuid.uuid4()) for _ in range(len(contents15))] | |
# In[214]: | |
data.add( | |
documents=contents15, | |
embeddings=embeddings15, | |
metadatas=metadata15, | |
ids=ids15 | |
) | |
# In[215]: | |
append_data(contents15, metadata15, embeddings15) | |
# In[216]: | |
df | |
# # <p style="color: orange;"> Document 15 Departement informatique </p> | |
# In[218]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/departements/Info/2/informatique",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("selectEnsFilter") | |
) | |
), | |
) | |
info = loader.load() | |
# In[219]: | |
info = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in info] | |
info | |
# ## splitting doc 15 into chunks | |
# In[221]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits16=text_splitter.split_documents(info) | |
# In[222]: | |
splits16 | |
# In[223]: | |
contents16= [doc.page_content for doc in splits16] | |
metadata16 = [doc.metadata for doc in splits16] | |
# In[224]: | |
embeddings16 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits16], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings16) | |
# In[225]: | |
ids16 = [str(uuid.uuid4()) for _ in range(len(contents16))] | |
# In[226]: | |
data.add( | |
documents=contents16, | |
embeddings=embeddings16, | |
metadatas=metadata16, | |
ids=ids16 | |
) | |
# In[227]: | |
append_data(contents16, metadata16, embeddings16) | |
# In[228]: | |
df | |
# # <p style="color: orange;">Document 16 departement Physique </p> | |
# # Document 16 Departement 16 | |
# In[231]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/departements/PH/3/physique",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("selectEnsFilter") | |
) | |
), | |
) | |
physique = loader.load() | |
# In[232]: | |
physique = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in physique] | |
physique | |
# ## splitting doc 16 into chunks | |
# In[234]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits17 = text_splitter.split_documents(physique) | |
# In[235]: | |
splits17 | |
# In[236]: | |
contents17= [doc.page_content for doc in splits17] | |
metadata17 = [doc.metadata for doc in splits17] | |
# In[237]: | |
embeddings17 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits17], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings17) | |
# In[238]: | |
ids17 = [str(uuid.uuid4()) for _ in range(len(contents17))] | |
# In[239]: | |
data.add( | |
documents=contents17, | |
embeddings=embeddings17, | |
metadatas=metadata17, | |
ids=ids17 | |
) | |
# In[240]: | |
append_data(contents17, metadata17, embeddings17) | |
# In[241]: | |
df | |
# # <p style="color: orange;">Document 17 Enseignement Tronc Commun </p> | |
# In[243]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/departements/ET/5/enseignement-tronc-commun",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
Enseignement_Tronc_Commun = loader.load() | |
# In[244]: | |
Enseignement_Tronc_Commun = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in Enseignement_Tronc_Commun] | |
Enseignement_Tronc_Commun | |
# ## splitting doc 17 into chunks | |
# In[246]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits18 = text_splitter.split_documents(Enseignement_Tronc_Commun) | |
# In[247]: | |
splits18 | |
# In[248]: | |
contents18= [doc.page_content for doc in splits18] | |
metadata18 = [doc.metadata for doc in splits18] | |
# In[249]: | |
embeddings18 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits18], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings18) | |
# In[250]: | |
ids18 = [str(uuid.uuid4()) for _ in range(len(contents18))] | |
# In[251]: | |
data.add( | |
documents=contents18, | |
embeddings=embeddings18, | |
metadatas=metadata18, | |
ids=ids18 | |
) | |
# In[252]: | |
append_data(contents18, metadata18, embeddings18) | |
# In[253]: | |
df | |
# # <p style="color: orange;">Document 18 اخر بلاغ للتسجيل بالنسبة للسنة الجامعية </p> | |
# | |
# In[255]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/articles/4712/%D8%A7%D8%AE%D8%B1-%D8%A8%D9%84%D8%A7%D8%BA-%D9%84%D9%84%D8%AA%D8%B3%D8%AC%D9%8A%D9%84-%D8%A8%D8%A7%D9%84%D9%86%D8%B3%D8%A8%D8%A9-%D9%84%D9%84%D8%B3%D9%86%D8%A9-%D8%A7%D9%84%D8%AC%D8%A7%D9%85%D8%B9%D9%8A%D8%A9-2024-2025",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
ekher_balegh = loader.load() | |
# In[256]: | |
ekher_balegh = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in ekher_balegh] | |
ekher_balegh | |
# ## splitting doc 18 into chunks | |
# In[258]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits19 = text_splitter.split_documents(ekher_balegh) | |
# In[259]: | |
splits19 | |
# In[260]: | |
contents19= [doc.page_content for doc in splits19] | |
metadata19 = [doc.metadata for doc in splits19] | |
# In[261]: | |
embeddings19 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits19], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings19) | |
# In[262]: | |
ids19 = [str(uuid.uuid4()) for _ in range(len(contents19))] | |
# In[263]: | |
data.add( | |
documents=contents19, | |
embeddings=embeddings19, | |
metadatas=metadata19, | |
ids=ids19 | |
) | |
# In[264]: | |
append_data(contents19, metadata19, embeddings19) | |
# In[265]: | |
df | |
# # <p style="color: orange;">Documents 19 Comptes extranet des étudiants 2024-2025 </p> | |
# | |
# In[267]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/articles/4673/comptes-extranet-des-etudiants-2024-2025",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
comptes_extranet_des_etudiants = loader.load() | |
# In[268]: | |
comptes_extranet_des_etudiants = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in comptes_extranet_des_etudiants] | |
comptes_extranet_des_etudiants | |
# ## splitting doc 19 into chunks | |
# In[270]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits20 = text_splitter.split_documents(comptes_extranet_des_etudiants) | |
# In[271]: | |
splits20 | |
# In[272]: | |
contents20= [doc.page_content for doc in splits20] | |
metadata20 = [doc.metadata for doc in splits20] | |
# In[273]: | |
embeddings20 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits20], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings20) | |
# In[274]: | |
ids20 = [str(uuid.uuid4()) for _ in range(len(contents20))] | |
# In[275]: | |
data.add( | |
documents=contents20, | |
embeddings=embeddings20, | |
metadatas=metadata20, | |
ids=ids20 | |
) | |
# In[276]: | |
append_data(contents20, metadata20, embeddings20) | |
# In[277]: | |
df | |
# # <p style="color: orange;"> Document 20 بلاغ الترسيم للسنة الجامعية </p> | |
# | |
# In[279]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/articles/4395/%D8%A8%D9%84%D8%A7%D8%BA-%D8%A7%D9%84%D8%AA%D8%B1%D8%B3%D9%8A%D9%85-%D9%84%D9%84%D8%B3%D9%86%D8%A9-%D8%A7%D9%84%D8%AC%D8%A7%D9%85%D8%B9%D9%8A%D8%A9-2024-2025",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
balegh_tarsim = loader.load() | |
# In[280]: | |
comptes_extranet_des_etudiants = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in balegh_tarsim] | |
balegh_tarsim | |
# ## splitting doc 20 into chunks | |
# In[282]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits21 = text_splitter.split_documents(balegh_tarsim) | |
# In[283]: | |
splits21 | |
# In[284]: | |
contents21= [doc.page_content for doc in splits21] | |
metadata21= [doc.metadata for doc in splits21] | |
# In[285]: | |
embeddings21= embeddings_model.embed_documents( | |
[doc.page_content for doc in splits21], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings21) | |
# In[286]: | |
ids21 = [str(uuid.uuid4()) for _ in range(len(contents21))] | |
# In[287]: | |
data.add( | |
documents=contents21, | |
embeddings=embeddings21, | |
metadatas=metadata21, | |
ids=ids21 | |
) | |
# In[288]: | |
append_data(contents21, metadata21, embeddings21) | |
# In[289]: | |
df | |
# # <p style="color: orange;">Document 21 Fiche de renseignements des diplômés </p> | |
# | |
# In[291]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/pages/138/Fiche-de-renseignements-des-dipl%C3%B4m%C3%A9s",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
Fiche_de_renseignements_des_diplome = loader.load() | |
# In[292]: | |
Fiche_de_renseignements_des_diplome = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in Fiche_de_renseignements_des_diplome] | |
Fiche_de_renseignements_des_diplome | |
# ## splitting doc 21 into chunks | |
# In[294]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits22 = text_splitter.split_documents(Fiche_de_renseignements_des_diplome) | |
# In[295]: | |
splits22 | |
# In[296]: | |
contents22= [doc.page_content for doc in splits22] | |
metadata22 = [doc.metadata for doc in splits22] | |
# In[297]: | |
embeddings22 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits22], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings22) | |
# In[298]: | |
ids22 = [str(uuid.uuid4()) for _ in range(len(contents22))] | |
# In[299]: | |
data.add( | |
documents=contents22, | |
embeddings=embeddings22, | |
metadatas=metadata22, | |
ids=ids22 | |
) | |
# In[300]: | |
append_data(contents22, metadata22, embeddings22) | |
# In[301]: | |
df | |
# # <p style="color: orange;">Document 22 Loi de creation FSM </p> | |
# | |
# In[303]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/pages/1/Loi-de-cr%C3%A9ation",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
loi_de_creation = loader.load() | |
# In[304]: | |
loi_de_creation = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in loi_de_creation] | |
loi_de_creation | |
# ## splitting doc 22 into chunks | |
# In[306]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits23 = text_splitter.split_documents(loi_de_creation) | |
# In[307]: | |
splits23 | |
# In[308]: | |
contents23= [doc.page_content for doc in splits23] | |
metadata23 = [doc.metadata for doc in splits23] | |
# In[309]: | |
embeddings23 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits23], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings23) | |
# In[310]: | |
ids23 = [str(uuid.uuid4()) for _ in range(len(contents23))] | |
# In[311]: | |
data.add( | |
documents=contents23, | |
embeddings=embeddings23, | |
metadatas=metadata23, | |
ids=ids23 | |
) | |
# In[312]: | |
append_data(contents23, metadata23, embeddings23) | |
# In[313]: | |
df | |
# # <p style="color: orange;">Document 23 loi en chiffre </p> | |
# | |
# In[315]: | |
loader = WebBaseLoader( | |
web_paths=("https://fsm.rnu.tn/fra/pages/3/En-chiffres",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
loi_en_chiffre = loader.load() | |
# In[316]: | |
loi_en_chiffre = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in loi_en_chiffre] | |
loi_en_chiffre | |
# ## splitting doc 23 into chunks | |
# In[318]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits24 = text_splitter.split_documents(loi_en_chiffre) | |
# In[319]: | |
splits24 | |
# In[320]: | |
contents24= [doc.page_content for doc in splits24] | |
metadata24 = [doc.metadata for doc in splits24] | |
# In[321]: | |
embeddings24 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits24], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings24) | |
# In[322]: | |
ids24 = [str(uuid.uuid4()) for _ in range(len(contents24))] | |
# In[323]: | |
data.add( | |
documents=contents24, | |
embeddings=embeddings24, | |
metadatas=metadata24, | |
ids=ids24 | |
) | |
# In[324]: | |
append_data(contents24, metadata24, embeddings24) | |
# In[325]: | |
df | |
# # LICENCE | |
# # <p style="color: orange;">Document 24 PARCOURS LMD Mathématiques Appliquées</p> | |
# | |
# In[328]: | |
loader = WebBaseLoader( | |
web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=ABhRHFxzAmNUZVIoBj4ENQYgX2sBPA==&etab=VjJQYQk7",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("center") | |
) | |
), | |
) | |
parcours_math_appli = loader.load() | |
# In[329]: | |
parcours_math_appli = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in parcours_math_appli] | |
parcours_math_appli | |
# ## splitting doc 24 into chunks | |
# In[331]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits25 = text_splitter.split_documents(parcours_math_appli) | |
# In[332]: | |
splits25 | |
# In[333]: | |
contents25= [doc.page_content for doc in splits25] | |
metadata25 = [doc.metadata for doc in splits25] | |
# In[334]: | |
embeddings25 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits25], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings25) | |
# In[335]: | |
ids25 = [str(uuid.uuid4()) for _ in range(len(contents25))] | |
# In[336]: | |
data.add( | |
documents=contents25, | |
embeddings=embeddings25, | |
metadatas=metadata25, | |
ids=ids25 | |
) | |
# In[337]: | |
append_data(contents25, metadata25, embeddings25) | |
# In[338]: | |
df | |
# # <p style="color: orange;"> Document 25 parcours lmd Computer Science</p> | |
# | |
# In[340]: | |
loader = WebBaseLoader( | |
web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=UkpTHlxzUzJXZlctDjJTYFZwDDI=&etab=VjJZaAg6",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("center") | |
) | |
), | |
) | |
parcours_computer_science = loader.load() | |
# In[341]: | |
parcours_computer_science = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in parcours_computer_science] | |
parcours_computer_science | |
# ## splitting doc 25 into chunks | |
# In[343]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits26 = text_splitter.split_documents(parcours_computer_science) | |
# In[344]: | |
splits26 | |
# In[345]: | |
contents26= [doc.page_content for doc in splits26] | |
metadata26= [doc.metadata for doc in splits26] | |
# In[346]: | |
embeddings26 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits26], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings26) | |
# In[347]: | |
ids26 = [str(uuid.uuid4()) for _ in range(len(contents26))] | |
# In[348]: | |
data.add( | |
documents=contents26, | |
embeddings=embeddings26, | |
metadatas=metadata26, | |
ids=ids26 | |
) | |
# In[349]: | |
append_data(contents26, metadata26, embeddings26) | |
# In[350]: | |
df | |
# # <p style="color: orange;"> Document 26 Parcours LMD Mesures et Instrumentation</p> | |
# | |
# In[352]: | |
loader = WebBaseLoader( | |
web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=W0NXGlp1UjNWZwN5BzkHMVN1DzsBPA==&etab=BGBYaQw+",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("center") | |
) | |
), | |
) | |
parcours_Mesures = loader.load() | |
# In[353]: | |
parcours_Mesures = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in parcours_Mesures] | |
parcours_Mesures | |
# ## splitting doc 26 into chunks
# In[355]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits27 = text_splitter.split_documents(parcours_Mesures) | |
# In[356]: | |
splits27 | |
# In[357]: | |
contents27= [doc.page_content for doc in splits27] | |
metadata27= [doc.metadata for doc in splits27] | |
# In[358]: | |
embeddings27 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits27], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings27) | |
# In[359]: | |
ids27 = [str(uuid.uuid4()) for _ in range(len(contents27))] | |
# In[360]: | |
data.add( | |
documents=contents27, | |
embeddings=embeddings27, | |
metadatas=metadata27, | |
ids=ids27 | |
) | |
# In[361]: | |
append_data(contents27, metadata27, embeddings27) | |
# In[362]: | |
df | |
# # <p style="color: orange;">Document 27 Parcours LMD Physique </p> | |
# | |
# In[364]: | |
loader = WebBaseLoader( | |
web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=W0NZFFp1UjNcbVshDjAENlJ0X2tTbg==&etab=AWUDMl9t",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("center") | |
) | |
), | |
) | |
parcours_physique = loader.load() | |
# In[365]: | |
parcours_physique = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in parcours_physique] | |
parcours_physique | |
# ## splitting doc 27 into chunks | |
# In[367]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits28 = text_splitter.split_documents(parcours_physique) | |
# In[368]: | |
splits28 | |
# In[369]: | |
contents28= [doc.page_content for doc in splits28] | |
metadata28= [doc.metadata for doc in splits28] | |
# In[370]: | |
embeddings28 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits28], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings28) | |
# In[371]: | |
ids28 = [str(uuid.uuid4()) for _ in range(len(contents28))] | |
# In[372]: | |
data.add( | |
documents=contents28, | |
embeddings=embeddings28, | |
metadatas=metadata28, | |
ids=ids28 | |
) | |
# In[373]: | |
append_data(contents28, metadata28, embeddings28) | |
# In[374]: | |
df | |
# # <p style="color: orange;">Document 28 Parcours LMD chimie </p> | |
# | |
# In[376]: | |
loader = WebBaseLoader( | |
web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=W0NYFV9wVDVcbQF7BzkKPQQiCz8HOg==&etab=B2NUZQAy",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("center") | |
) | |
), | |
) | |
parcours_chimie = loader.load() | |
# In[377]: | |
parcours_chimie = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in parcours_chimie] | |
parcours_chimie | |
# ## splitting doc 28 into chunks | |
# In[379]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits29= text_splitter.split_documents(parcours_chimie) | |
# In[380]: | |
splits29 | |
# In[381]: | |
contents29= [doc.page_content for doc in splits29] | |
metadata29= [doc.metadata for doc in splits29] | |
# In[382]: | |
embeddings29 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits29], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings29) | |
# In[383]: | |
ids29 = [str(uuid.uuid4()) for _ in range(len(contents29))] | |
# In[384]: | |
data.add( | |
documents=contents29, | |
embeddings=embeddings29, | |
metadatas=metadata29, | |
ids=ids29 | |
) | |
# In[385]: | |
append_data(contents29, metadata29, embeddings29) | |
# In[386]: | |
df | |
# # <p style="color: orange;"> Document 29 Parcours LMD Physique-Chimie</p> | |
# | |
# In[388]: | |
loader = WebBaseLoader( | |
web_paths=("http://www.parcours-lmd.salima.tn/listeueetab.php?parc=Bh4HSlh3VTQGN1ctVWsAMVJ0DjA=&etab=VjJZaA0/",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("center") | |
) | |
), | |
) | |
parcours_physique_chimie = loader.load() | |
# In[389]: | |
parcours_physique_chimie = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in parcours_physique_chimie] | |
parcours_physique_chimie | |
# ## splitting doc 29 into chunks | |
# In[391]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits30= text_splitter.split_documents(parcours_physique_chimie) | |
# In[392]: | |
splits30 | |
# In[393]: | |
contents30= [doc.page_content for doc in splits30] | |
metadata30= [doc.metadata for doc in splits30] | |
# In[394]: | |
embeddings30 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits30], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings30) | |
# In[395]: | |
ids30 = [str(uuid.uuid4()) for _ in range(len(contents30))] | |
# In[396]: | |
data.add( | |
documents=contents30, | |
embeddings=embeddings30, | |
metadatas=metadata30, | |
ids=ids30 | |
) | |
# In[397]: | |
append_data(contents30, metadata30, embeddings30) | |
df
# # <p style="color: orange;">Document 30 Demande de diplômes</p>
#
loader = WebBaseLoader(
web_paths=("https://fsm.rnu.tn/fra/articles/1249/demande-de-diplomes",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("content") | |
) | |
), | |
) | |
doc_demande_de_diplome = loader.load() | |
# In[401]: | |
doc_demande_de_diplome = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in doc_demande_de_diplome] | |
doc_demande_de_diplome | |
# ## splitting doc 30 into chunks | |
# In[403]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits31 = text_splitter.split_documents(doc_demande_de_diplome) | |
# In[404]: | |
splits31 | |
# In[405]: | |
contents31= [doc.page_content for doc in splits31] | |
metadata31= [doc.metadata for doc in splits31] | |
# In[406]: | |
embeddings31 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits31], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings31) | |
# In[407]: | |
ids31 = [str(uuid.uuid4()) for _ in range(len(contents31))] | |
# In[408]: | |
data.add( | |
documents=contents31, | |
embeddings=embeddings31, | |
metadatas=metadata31, | |
ids=ids31 | |
) | |
# In[409]: | |
append_data(contents31, metadata31, embeddings31) | |
# In[410]: | |
df | |
# # <p style="color: orange;">Document 31 Information on the research master's in mathematics</p>
# | |
# In[412]: | |
loader = WebBaseLoader( | |
web_paths=("https://um.rnu.tn/fr/formations/formation-lmd/master/mat%C3%A8re-de-recherche-en-math%C3%A9matiques-fsm/",), | |
bs_kwargs=dict( | |
parse_only=bs4.SoupStrainer( | |
class_=("single-post-content single-content") | |
) | |
), | |
) | |
info_supp_mastere_math = loader.load() | |
# In[413]: | |
info_supp_mastere_math = [ | |
Document(page_content=clean_text(doc.page_content), metadata=doc.metadata) | |
for doc in info_supp_mastere_math] | |
info_supp_mastere_math | |
# ## splitting doc 31 into chunks
# In[415]: | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]) | |
splits32 = text_splitter.split_documents(info_supp_mastere_math) | |
# In[416]: | |
splits32 | |
# In[417]: | |
contents32= [doc.page_content for doc in splits32] | |
metadata32 = [doc.metadata for doc in splits32] | |
# In[418]: | |
embeddings32 = embeddings_model.embed_documents( | |
[doc.page_content for doc in splits32], | |
# normalize_embeddings=True, | |
# batch_size=256, | |
# show_progress_bar=True | |
) | |
print(embeddings32) | |
# In[419]: | |
ids32 = [str(uuid.uuid4()) for _ in range(len(contents32))] | |
# In[420]: | |
data.add( | |
documents=contents32, | |
embeddings=embeddings32, | |
metadatas=metadata32, | |
ids=ids32 | |
) | |
# In[421]: | |
append_data(contents32, metadata32, embeddings32) | |
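# The blocks above repeat the same load -> clean -> split -> embed -> add pattern for every web page.
# A minimal consolidation sketch of that pattern, reusing clean_text, embeddings_model, data and
# append_data defined earlier; the helper name ingest_web_page is illustrative, not part of the original notebook.
def ingest_web_page(url, css_class):
    loader = WebBaseLoader(
        web_paths=(url,),
        bs_kwargs=dict(parse_only=bs4.SoupStrainer(class_=css_class)),
    )
    docs = [Document(page_content=clean_text(d.page_content), metadata=d.metadata) for d in loader.load()]
    splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100, separators=["\n\n", "\n", ".", " "])
    splits = splitter.split_documents(docs)
    contents = [d.page_content for d in splits]
    metadatas = [d.metadata for d in splits]
    embeddings = embeddings_model.embed_documents(contents)
    ids = [str(uuid.uuid4()) for _ in range(len(contents))]
    data.add(documents=contents, embeddings=embeddings, metadatas=metadatas, ids=ids)
    append_data(contents, metadatas, embeddings)
    return splits
# Example call, using a URL and CSS class already ingested above:
#   ingest_web_page("https://fsm.rnu.tn/fra/articles/1249/demande-de-diplomes", "content")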
stored = data.get(include=['embeddings'])  # .get() returns a dict, so keep the collection object in `data`
print(stored)
# In[427]:
if 'embeddings' in stored:
    embeddings_array = np.array(stored['embeddings'])
    print("Embeddings shape:", embeddings_array.shape)
else:
    embeddings_array = np.array([])
    print("No embeddings found in vectorstore.")
# In[428]: | |
if embeddings_array.size > 0:
    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(embeddings_array)
    # Plot embeddings
    plt.figure(figsize=(8, 6))
    plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.7)
    plt.xlabel("PCA 1")
    plt.ylabel("PCA 2")
    plt.title("2D Visualization of Embeddings")
    plt.show()
else:
    print("No embeddings available for PCA visualization.")
# # Manually testing retrieval (2nd attempt), just checking 👌
# In[430]: | |
data = chroma_client.get_collection(name="my_dataaaa") | |
# In[431]: | |
query_embedding = embeddings_model.embed_query("Quelles sont les documents de stage obligatoire?") | |
results = data.query( | |
query_embeddings=[query_embedding], | |
n_results=50 | |
) | |
# In[432]: | |
results | |
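# `results` is a dict of parallel lists keyed by 'ids', 'documents', 'metadatas' and 'distances',
# with one inner list per query embedding; e.g. results['documents'][0][:3] shows the top three chunks.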
# In[783]: | |
chroma_client = chromadb.PersistentClient(path="chroma_db") | |
collections = chroma_client.list_collections() | |
print("Available collections:", collections) | |
if "my_dataaaa" in collections: | |
collection = chroma_client.get_collection(name="my_dataaaa") | |
print(" Successfully loaded collection:", collection) | |
else: | |
print("Collection 'my_dataaaa' does not exist.", collections) | |
embeddings_model = HuggingFaceEmbeddings(model_name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5") | |
from transformers import AutoModelForSequenceClassification, pipeline
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")  # note: the zero-shot pipeline below loads its own copy
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
def detect_intent(text):
    result = classifier(text, candidate_labels=["question", "greeting", "small talk", "feedback", "thanks"])
    label = result["labels"][0]
    return label.lower()
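# Hypothetical examples of the zero-shot intent labels (actual outputs depend on the BART-MNLI model):
#   detect_intent("Bonjour !")                      -> "greeting"
#   detect_intent("Comment obtenir mon diplôme ?")  -> "question"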
chroma_db_path = "./chroma_db" | |
chroma_client = chromadb.PersistentClient(path=chroma_db_path) | |
data = chroma_client.get_collection(name="my_dataaaa") | |
vectorstore = Chroma( | |
collection_name="my_dataaaa", | |
persist_directory="./chroma_db", | |
embedding_function=embeddings_model | |
) | |
# Create a retriever from the Chroma datastore
retriever = vectorstore.as_retriever( | |
search_type="mmr", | |
search_kwargs={'k': 6, 'lambda_mult': 0.25} | |
) | |
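# MMR retrieval: 'k' is the number of chunks handed to the reranker; 'lambda_mult' close to 0 favours
# diversity among the selected chunks, close to 1 favours pure similarity to the query (0.25 leans towards diversity).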
from sentence_transformers import CrossEncoder
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
def rerank_docs(query, docs, top_k=50):
    pairs = [(query, doc.page_content) for doc in docs]
    scores = reranker.predict(pairs)
    scored_docs = list(zip(docs, scores))
    scored_docs = sorted(scored_docs, key=lambda x: x[1], reverse=True)
    top_docs = [doc for doc, score in scored_docs[:top_k]]
    return top_docs
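# Sketch of how the retriever and the cross-encoder reranker fit together (the question string is
# illustrative; top_k=5 is an assumption here, the chain below keeps the default of 50):
#   candidate_docs = retriever.invoke("Quels sont les documents de stage obligatoires ?")
#   best_docs = rerank_docs("Quels sont les documents de stage obligatoires ?", candidate_docs, top_k=5)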
custom_prompt = PromptTemplate.from_template(""" | |
You are a helpful assistant answering student questions based ONLY on the provided context. | |
You must read the entire context carefully and include all relevant information in your answer. | |
If multiple documents or requirements are mentioned, list them all clearly and completely. | |
If the answer is not found in the context, respond with: "je ne trouve pas la réponse." | |
Do not use your own knowledge for university-related questions. Only use what is in the context. | |
Structure the answer clearly and completely. Do not make any assumptions if the context does not have the answer. | |
Context: | |
{context} | |
Question: | |
{question} | |
Answer: | |
""") | |
llm = ChatOpenAI(model="gpt-3.5-turbo") | |
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
# format_docs joins the reranked chunks into the single context string used by the RAG chain below
rag_chain = (
    {
        "docs": retriever,
        "question": RunnablePassthrough()
    }
    # rerank the retrieved chunks against the actual question, then build the prompt inputs
    | RunnableLambda(lambda x: {
        "context": format_docs(rerank_docs(x["question"], x["docs"])),
        "question": x["question"],
    })
    | custom_prompt
    | llm
    | StrOutputParser()
)
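# Example invocation: the question goes through retrieval, reranking, the prompt and the LLM in one call.
#   print(rag_chain.invoke("Comment faire une demande de diplôme ?"))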
import json
from datetime import datetime
PENDING_QUESTIONS_FILE = "pending_questions.json"
def store_pending_question(user_email, question):
    q_id = str(uuid.uuid4())
    pending = {
        "id": q_id,
        "timestamp": datetime.utcnow().isoformat(),
        "user_email": user_email,
        "question": question
    }
    if os.path.exists(PENDING_QUESTIONS_FILE):
        with open(PENDING_QUESTIONS_FILE, "r") as f:
            data = json.load(f)
    else:
        data = []
    data.append(pending)
    with open(PENDING_QUESTIONS_FILE, "w") as f:
        json.dump(data, f, indent=4)
    return q_id
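# Each record appended to pending_questions.json has this shape (values illustrative):
#   {"id": "<uuid>", "timestamp": "2025-01-01T12:00:00", "user_email": "etudiant@fsm.rnu.tn", "question": "..."}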
def send_question_to_admin(user_email, user_question, question_id):
    admin_email = "belhassen.esra@icloud.com"
    smtp_server = "smtp.gmail.com"
    smtp_port = 587
    sender_email = "fsmchatbot@gmail.com"
    sender_password = os.getenv("BOT_EMAIL_PASSWORD")
    subject = f"Nouvelle question [{question_id}]"
    body = (
        f"Question ID: {question_id}\n"
        f"E-mail de l'étudiant : {user_email}\n\n"  # include the student's address so the admin can reply
        f"Question posée :\n\n{user_question}"
    )
    message = MIMEMultipart()
    message["From"] = sender_email
    message["To"] = admin_email
    message["Reply-To"] = "fsmchatbot@gmail.com"
    message["Subject"] = subject
    message.attach(MIMEText(body, "plain"))
    try:
        with smtplib.SMTP(smtp_server, smtp_port) as server:
            server.starttls()
            server.login(sender_email, sender_password)
            server.sendmail(sender_email, admin_email, message.as_string())
            return True
    except Exception as e:
        print("Error sending email:", e)
        return False
def university_related(question):
    labels = ["university", "general knowledge"]
    result = classifier(question, candidate_labels=labels)
    top_label = result["labels"][0]
    return top_label.lower() == "university"
def uncertain(answer):
    uncertain_phrases = [
        "je ne trouve pas la réponse",
        "désolé, je ne peux pas vous aider"
    ]
    return any(phrase in answer.lower() for phrase in uncertain_phrases) or answer.strip() == ""
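# Illustrative behaviour of the fallback check:
#   uncertain("Je ne trouve pas la réponse.")  -> True
#   uncertain("Voici la procédure à suivre…")  -> False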
def handle_user_query(question, user_email=None):
    # classify the message intent with the zero-shot classifier
    intent = detect_intent(question.lower())
    if intent in ["greeting", "small talk"]:
        return "Salut 👋 ! Posez-moi une question précise sur les procédures universitaires 😊."
    if not university_related(question):
        return "Merci de poser une question sur les procédures universitaires 😊"
    # run the RAG pipeline
    answer = rag_chain.invoke(question)
    # fall back to the admin workflow when no relevant context was found
    if uncertain(answer):
        if not user_email:
            return (
                "Je ne trouve pas la réponse à cette question. "
                "Veuillez me fournir votre adresse e-mail et la question en français pour que je puisse la transmettre à un administrateur.")
        q_id = store_pending_question(user_email, question)
        sent = send_question_to_admin(user_email, question, q_id)
        if sent:
            return "Votre question a été transmise à l'administration. Vous recevrez une réponse par e-mail dès que possible."
        else:
            return "Une erreur est survenue lors de l'envoi de votre question. Veuillez réessayer plus tard."
    else:
        return answer
user_email = "" | |
def chatbot_fn(message, history):
    global user_email
    if not user_email:
        if "@gmail.com" in message or "@fsm.rnu.tn" in message:
            user_email = message
            return "Merci ! Maintenant, posez-moi votre question 😊"
        else:
            return "Bienvenue 👋 Veuillez entrer votre adresse e-mail pour commencer."
    return handle_user_query(message, user_email)
with gr.Blocks() as chat:
    gr.ChatInterface(
        fn=chatbot_fn,
        title="Chatbot Universitaire 🤖 🧠",
        description="Commencez par entrer votre adresse e-mail. Ensuite, posez toutes vos questions sur les procédures universitaires !",
        examples=[
            ["Comment faire une demande de réinscription ?"],
            ["Quels sont les délais pour la soutenance ?"]
        ],
        submit_btn="Envoyer"
    )
    gr.Markdown("© 2025 Esra Belhassen. All rights reserved")
chat.launch(share=True) | |