stivenDR14 committed
Commit 07c0a81 · Parent(s): c5332dd

update method for summary and models

Files changed:
- app.py +4 -4
- pdf_processor.py +67 -11
- utils.py +2 -2
app.py
CHANGED

@@ -146,13 +146,13 @@ class PDFProcessorUI:
                     label=TRANSLATIONS[self.current_language]["mini_analysis_title"],
                     lines=10
                 )
-                summary_output = gr.Textbox(
-                    label=TRANSLATIONS[self.current_language]["summary_label"],
-                    lines=10
-                )
                 summarize_btn = gr.Button(
                     TRANSLATIONS[self.current_language]["generate_summary"]
                 )
+                summary_output = gr.Markdown(
+                    label=TRANSLATIONS[self.current_language]["summary_label"],
+                    height=400
+                )
 
             with specialist_tab:
                 specialist_title = gr.Markdown(TRANSLATIONS[self.current_language]["specialist_title"])
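
Net effect of the app.py hunk: the summary component moves below the button and switches from a plain gr.Textbox(lines=10) to a gr.Markdown panel with a fixed height, so the markdown the model now emits is rendered instead of shown as raw text. A minimal sketch of how such a component is typically wired up; the summarize handler and its input here are hypothetical stand-ins, since the event bindings are outside this hunk:

import gradio as gr

# Hypothetical handler standing in for the app's real summarize callback.
def summarize(text):
    return "## Summary\n\n- first key point\n- second key point"

with gr.Blocks() as demo:
    source = gr.Textbox(label="Text to summarize")
    summarize_btn = gr.Button("Generate summary")
    # Renders returned markdown; height=400 gives a fixed, scrollable panel,
    # unlike the previous gr.Textbox(lines=10), which displayed raw text.
    summary_output = gr.Markdown(label="Summary", height=400)
    summarize_btn.click(fn=summarize, inputs=source, outputs=summary_output)

demo.launch()
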
pdf_processor.py
CHANGED

@@ -17,6 +17,8 @@ import requests
 import os
 from dotenv import load_dotenv
 import re
+from sklearn.cluster import KMeans
+from sklearn.metrics.pairwise import cosine_similarity
 
 OLLAMA_LLM = "granite3.1-dense"
 OLLAMA_EMBEDDINGS = "granite-embedding:278m"
@@ -228,21 +230,74 @@ class PDFProcessor:
 
         return result["result"] + "\n\nSources: " + page_labels_text
 
-    def
-        print("Summarizer by k
+    def summarizer_by_k_means(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, summary_prompt, just_get_documments=False):
+        print("Summarizer by k means in language: ", self.language)
         if not vectorstore:
             return TRANSLATIONS[self.language]["load_pdf_first"]
 
         current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
-        # Get all documents from the vectorstore
-        retriever = vectorstore.as_retriever(search_kwargs={"k": k})
-        documents = retriever.invoke('Summary of the document and key points')
 
+        # Get all the documents from the vectorstore
+        documents = vectorstore.get(include=["embeddings", "documents"])
+        documentsByIds = documents["ids"]
+        documentsByEmbeddings = documents["embeddings"]
+        documentsByDocuments = documents["documents"]
+
+        print("documents length: ", len(documentsByEmbeddings))
+
+        # Pick a divisor from the document count (<= 16 docs -> 2, <= 64 -> 4,
+        # <= 128 -> 8, else 12); the cluster count is then len(docs) // divisor.
+        number_for_CreateClusters = 2
+        if len(documentsByEmbeddings) <= 16:
+            number_for_CreateClusters = 2
+        elif len(documentsByEmbeddings) <= 64:
+            number_for_CreateClusters = 4
+        elif len(documentsByEmbeddings) <= 128:
+            number_for_CreateClusters = 8
+        else:
+            number_for_CreateClusters = 12
 
+        num_clusters = max(1, len(documentsByEmbeddings) // number_for_CreateClusters)
+
+        print("num_clusters: ", num_clusters)
+        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
+        kmeans.fit(documentsByEmbeddings)
+
+        summary_documents = []
+        map_ids_documents = {}
+        # For each cluster, choose the document whose embedding has the highest
+        # cosine similarity to the centroid, and keep a map of the chosen ids.
+        for i in range(num_clusters):
+            # Get the indices of the documents in the cluster
+            cluster_indices = [j for j, label in enumerate(kmeans.labels_) if label == i]
+
+            if not cluster_indices:  # If there are no documents in this cluster, continue
+                continue
+
+            # Get the embeddings of the documents in this cluster
+            cluster_embeddings = [documentsByEmbeddings[j] for j in cluster_indices]
+
+            # Calculate the similarity with the centroid
+            centroid = kmeans.cluster_centers_[i]
+            similarities = [cosine_similarity([embedding], [centroid])[0][0] for embedding in cluster_embeddings]
+
+            # Find the most similar document to the centroid
+            most_similar_index = cluster_indices[similarities.index(max(similarities))]
+
+            # Add the most similar document to the summary list
+            summary_documents.append(documentsByDocuments[most_similar_index])
+            map_ids_documents[most_similar_index] = documentsByIds[most_similar_index]
+
+        print("map_ids_documents: ", map_ids_documents)
+
+        # Join the summary documents into a single string
+        summary_text = "\n".join(summary_documents)
+        print("summary_documents: ", summary_text)
+
+        if just_get_documments:
+            return summary_text
+
         summary_chain = summary_prompt | current_llm
-        final_summary = summary_chain.invoke({"texts":
+        final_summary = summary_chain.invoke({"texts": summary_text, "language": self.language})
+
        return final_summary
 
     def get_summary(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, just_get_documments=False, k=10):
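
Stripped of the vectorstore and LLM plumbing, the new summarizer_by_k_means is a standard embedding-clustering selection: cluster all chunk embeddings with k-means, then keep, per cluster, the chunk whose embedding is most cosine-similar to the centroid, so the summary prompt sees a small but representative sample of the document. A self-contained sketch of that selection step, with random vectors standing in for real embeddings:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(40, 8))      # 40 chunks with 8-dim embeddings
chunks = [f"chunk {i}" for i in range(40)]

# Same sizing heuristic as the diff: divisor 2 for <= 16 docs, 4 for <= 64,
# 8 for <= 128, else 12. Here 40 chunks -> 40 // 4 = 10 clusters.
n = len(embeddings)
divisor = 2 if n <= 16 else 4 if n <= 64 else 8 if n <= 128 else 12
num_clusters = max(1, n // divisor)

kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(embeddings)

representatives = []
for i in range(num_clusters):
    members = np.where(kmeans.labels_ == i)[0]
    if members.size == 0:
        continue
    # Cosine similarity of every member embedding against the centroid,
    # in one vectorized call rather than a per-document loop.
    sims = cosine_similarity(embeddings[members],
                             kmeans.cluster_centers_[i].reshape(1, -1)).ravel()
    representatives.append(chunks[members[sims.argmax()]])

print(representatives)  # one representative chunk per cluster

Choosing the member closest to the centroid, rather than the centroid itself, guarantees the prompt is built from actual document text, and scaling the cluster count with corpus size keeps the selected sample roughly proportional to document length.
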
|
|
|
@@ -250,18 +305,19 @@ class PDFProcessor:
         final_summary_prompt = PromptTemplate(
             input_variables=["texts", "language"],
             template="""
-            Combine the following texts into a cohesive and structured
+            Combine the following texts into a cohesive and structured summary:
             ------------
             {texts}
             ------------
-            The final summary should be between 2 and 4 paragraphs.
             Preserve the original meaning without adding external information or interpretations.
             Ensure clarity, logical flow, and coherence between the combined points.
             The summary must be in {language}.
+            The output must be in markdown format.
+            Output:
             """
         )
 
-        return self.
+        return self.summarizer_by_k_means(vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, final_summary_prompt, just_get_documments)
 
 
     def get_specialist_opinion(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
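
The summary_prompt | current_llm line above is LangChain's runnable composition: piping a PromptTemplate into a model yields a chain whose invoke() fills the template variables and calls the model. A minimal sketch of that pattern; FakeListLLM is used only so the snippet runs without credentials, where the app would pass the model returned by self.set_llm(...) instead:

from langchain_core.prompts import PromptTemplate
from langchain_community.llms.fake import FakeListLLM

summary_prompt = PromptTemplate(
    input_variables=["texts", "language"],
    template="Combine the following texts into a summary in {language}:\n{texts}",
)

# Offline stand-in for the real LLM so the sketch is runnable as-is.
current_llm = FakeListLLM(responses=["## Summary\n\n- a key point"])

summary_chain = summary_prompt | current_llm   # RunnableSequence: prompt -> llm
final_summary = summary_chain.invoke(
    {"texts": "chunk A\nchunk B", "language": "English"}
)
print(final_summary)
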
utils.py
CHANGED

@@ -10,7 +10,7 @@ if ENVIRONMENT == "dev":
     AI_MODELS = {
         "Huggingface / Mistral Nemo Instruct": "mistralai/Mistral-Nemo-Instruct-2407",
         "Huggingface / Microsoft Phi 3.5 Mini Instruct": "microsoft/Phi-3.5-mini-instruct",
-        "Huggingface / Google Gemma
+        "Huggingface / Google Gemma 3 12B Instruct": "google/gemma-3-12b-it",
         "Huggingface / Meta Llama 3.1 8B Instruct": "meta-llama/Llama-3.1-8B-Instruct",
         "IBM Granite3.1 dense / Ollama local": "ollama",
         "Open AI / GPT-4o-mini": "openai",

@@ -19,7 +19,7 @@ else:
     AI_MODELS = {
         "Huggingface / Mistral Nemo Instruct": "mistralai/Mistral-Nemo-Instruct-2407",
         "Huggingface / Microsoft Phi 3.5 Mini Instruct": "microsoft/Phi-3.5-mini-instruct",
-        "Huggingface / Google Gemma
+        "Huggingface / Google Gemma 3 12B Instruct": "google/gemma-3-12b-it",
         "Huggingface / Meta Llama 3.1 8B Instruct": "meta-llama/Llama-3.1-8B-Instruct",
         "Open AI / GPT-4o-mini": "openai",
     }
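
One detail worth noting in utils.py: AI_MODELS values follow two conventions, full Hugging Face repo ids for hosted inference versus the sentinels "ollama" and "openai" for the local and OpenAI backends. A small sketch of that convention; the resolve_backend helper is hypothetical, not part of the repo:

# Excerpt of the dict shape from utils.py (dev branch of the diff).
AI_MODELS = {
    "Huggingface / Google Gemma 3 12B Instruct": "google/gemma-3-12b-it",
    "IBM Granite3.1 dense / Ollama local": "ollama",
    "Open AI / GPT-4o-mini": "openai",
}

def resolve_backend(display_name: str) -> str:
    # Hypothetical helper: values are either provider sentinels or HF repo ids.
    model_id = AI_MODELS[display_name]
    return model_id if model_id in ("ollama", "openai") else "huggingface"

print(resolve_backend("Huggingface / Google Gemma 3 12B Instruct"))  # huggingface
print(resolve_backend("IBM Granite3.1 dense / Ollama local"))        # ollama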