sashank812 committed
Commit 368c287 · verified · 1 Parent(s): 285e3b6

added files

Files changed (3)
  1. README.md +33 -13
  2. main.py +422 -0
  3. requirements.txt +17 -0
README.md CHANGED
@@ -1,13 +1,33 @@
- ---
- title: Multi Document Summarization
- emoji: 😻
- colorFrom: gray
- colorTo: indigo
- sdk: gradio
- sdk_version: 5.25.2
- app_file: app.py
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Multi Document Summarization
+
+ This project is a multi-document summarization tool built on BART and Longformer. It accepts documents in a range of file formats, groups related documents by clustering their sentence embeddings, summarizes each group, and renders dendrogram, t-SNE, TF-IDF, and word-cloud visualizations.
+
+ ## Features
+ - Summarizes multiple documents into concise, per-cluster summaries (see the pipeline sketch below).
+ - Supports file formats: `.docx`, `.txt`, `.html`, `.pdf`, `.csv`, `.xlsx`, `.json`, `.xml`, `.ppt`, `.pptx`.
+ - Visualizations: dendrogram, t-SNE, TF-IDF, word cloud.
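+
+ Under the hood, uploaded documents are cleaned, embedded with a Sentence-Transformers model, clustered with HDBSCAN, and each cluster is summarized with BART or Longformer. A minimal sketch of that pipeline, assuming the helper functions in `main.py` are imported directly (note that importing `main.py` as written also launches the Gradio app):
+ ```python
+ from main import clean_files, get_embeddings, clustering_labels, summarize
+
+ files = ["report_2023.pdf", "report_2024.docx"]   # hypothetical input paths
+ texts = clean_files(files)                        # extract and clean text per file
+ embeddings = get_embeddings(texts)                # sentence-transformer embeddings
+ labels = clustering_labels(embeddings)            # HDBSCAN cluster labels (-1 = noise)
+ summaries = summarize(embeddings, labels, texts)  # one summary per cluster / noise doc
+ print("\n\n".join(summaries))
+ ```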
+
+ ## Installation
+ 1. Clone the repository:
+    ```bash
+    git clone https://github.com/your-username/abstractive-text-summarization.git
+    ```
+ 2. Navigate to the project directory:
+    ```bash
+    cd abstractive-text-summarization
+    ```
+ 3. Install dependencies:
+    ```bash
+    pip install -r requirements.txt
+    ```
+
+ ## Usage
+ 1. Run the application:
+    ```bash
+    python main.py
+    ```
+ 2. Open the Gradio interface in your browser.
+ 3. Upload files and click "Summarize" to generate summaries and visualizations (a programmatic alternative is sketched below).
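+
+ You can also call the running app from Python with `gradio_client`. A minimal sketch (the port and endpoint name are assumptions; check `client.view_api()` for the exact signature):
+ ```python
+ from gradio_client import Client, handle_file
+
+ client = Client("http://127.0.0.1:7860")  # the locally running app from step 1
+ result = client.predict(
+     [handle_file("doc1.pdf"), handle_file("doc2.txt")],  # files to summarize
+     api_name="/summarize_docs",  # assumed endpoint name
+ )
+ print(result[0])  # the bullet list of summaries
+ ```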
+
+ ## License
+ This project is licensed under the MIT License. See the LICENSE file for details.
main.py ADDED
@@ -0,0 +1,422 @@
+ import re
+ import io
+ import os
+ import json
+ import warnings
+ import xml.etree.ElementTree as ET
+
+ import docx
+ import pptx
+ import nltk
+ import hdbscan
+ import numpy as np
+ import pandas as pd
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ import scipy.cluster.hierarchy as sch
+ import plotly.express as px
+ import torch
+
+ from bs4 import BeautifulSoup
+ from pypdf import PdfReader
+ from PIL import Image
+ from wordcloud import WordCloud
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize, sent_tokenize
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+ from sklearn.manifold import TSNE
+ from sentence_transformers import SentenceTransformer, util
+ from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
+ from transformers import LongformerTokenizer, EncoderDecoderModel
+
+ # comtypes drives PowerPoint over COM and only works on Windows with
+ # PowerPoint installed; it is needed solely for legacy .ppt conversion.
+ import comtypes
+ import comtypes.client
+
+ # Tokenizer and stopword data used for tokenization and the TF-IDF plot.
+ nltk.download("punkt")
+ nltk.download("punkt_tab")
+ nltk.download("stopwords")
+
+ warnings.filterwarnings("ignore")
+
+
+ def clean_text(text):
+     """Strip URLs, collapse whitespace, and drop most punctuation."""
+     text = re.sub(r"http\S+|www\S+|https\S+", "", text)
+     text = re.sub(r"\s+", " ", text).strip()
+     text = re.sub(r"[^\w\s,.]", "", text)
+     return text
+
+
+ def extract_and_clean_text(file_path):
+     """Extract raw text from a supported file type and clean it."""
+     text = ""
+     if file_path.endswith(".docx"):
+         doc = docx.Document(file_path)
+         for paragraph in doc.paragraphs:
+             text += paragraph.text + " "
+     elif file_path.endswith(".txt"):
+         with open(file_path, "r", encoding="utf-8") as f:
+             text = f.read()
+     elif file_path.endswith((".html", ".htm")):
+         with open(file_path, "r", encoding="utf-8") as f:
+             html_content = f.read()
+         soup = BeautifulSoup(html_content, "html.parser")
+         text = soup.get_text(separator=" ", strip=True)
+     elif file_path.endswith(".pdf"):
+         reader = PdfReader(file_path)
+         for page in reader.pages:
+             # extract_text() can return None for image-only pages.
+             text += (page.extract_text() or "") + " "
+     elif file_path.endswith(".csv"):
+         df = pd.read_csv(file_path)
+         text = " ".join(df.astype(str).agg(" ".join, axis=1))
+     elif file_path.endswith(".xlsx"):
+         df = pd.read_excel(file_path)
+         text = " ".join(df.astype(str).agg(" ".join, axis=1))
+     elif file_path.endswith(".json"):
+         with open(file_path, "r", encoding="utf-8") as f:
+             data = json.load(f)
+         text = " ".join([str(item) for item in data])
+     elif file_path.endswith(".xml"):
+         tree = ET.parse(file_path)
+         root = tree.getroot()
+         text = " ".join([elem.text for elem in root.iter() if elem.text])
+     elif file_path.endswith(".pptx"):
+         from pptx import Presentation
+
+         prs = Presentation(file_path)
+         for slide in prs.slides:
+             for shape in slide.shapes:
+                 if hasattr(shape, "text"):
+                     text += shape.text + " "
+     elif file_path.endswith(".ppt"):
+         # Convert legacy .ppt to .pptx via PowerPoint COM automation
+         # (Windows only, requires PowerPoint to be installed).
+         comtypes.CoInitialize()
+         try:
+             powerpoint = comtypes.client.CreateObject("PowerPoint.Application")
+             powerpoint.Visible = 1
+             ppt = powerpoint.Presentations.Open(file_path)
+             pptx_path = file_path + "x"  # Save as .pptx
+             ppt.SaveAs(pptx_path, 24)  # 24 is the format code for .pptx
+             ppt.Close()
+             powerpoint.Quit()
+             file_path = pptx_path  # Update file_path to the converted .pptx file
+         finally:
+             comtypes.CoUninitialize()
+         from pptx import Presentation
+
+         prs = Presentation(file_path)
+         for slide in prs.slides:
+             for shape in slide.shapes:
+                 if hasattr(shape, "text"):
+                     text += shape.text + " "
+     else:
+         raise ValueError("Unsupported file type: {}".format(file_path))
+     cleaned_text = clean_text(text)
+     return cleaned_text
+
+
+ def clean_files(file_list):
+     """Extract and clean text from every uploaded file."""
+     cleaned_files = []
+     for file in file_list:
+         cleaned_files.append(extract_and_clean_text(file))
+     return cleaned_files
+
+
+ def get_embeddings(text):
+     """Encode the document texts with a sentence-transformers model."""
+     model = SentenceTransformer("all-mpnet-base-v2")
+     embeddings = model.encode(text)
+     return embeddings
+
+
+ def clustering_labels(embeddings):
+     """Cluster document embeddings with HDBSCAN; label -1 marks noise."""
+     warnings.filterwarnings("ignore")
+     embeddings = np.array(embeddings)
+     if len(embeddings) < 2:
+         raise ValueError(
+             "Not enough data points for clustering. At least 2 are required."
+         )
+     min_cluster_size = min(2, len(embeddings))
+     cluster = hdbscan.HDBSCAN(
+         min_cluster_size=min_cluster_size,
+         metric="euclidean",
+         cluster_selection_method="eom",
+     ).fit(embeddings)
+     return cluster.labels_
+
+
+ def bart_summarizer(text):
+     """Summarize short inputs (under 1024 BART tokens) with BART-large-CNN."""
+     model_name_bart = "facebook/bart-large-cnn"
+     tokenizer = BartTokenizer.from_pretrained(model_name_bart)
+     model = BartForConditionalGeneration.from_pretrained(model_name_bart)
+     tokenize_inputs = tokenizer.encode(
+         text, return_tensors="pt", max_length=1024, truncation=True
+     )
+     ids_summarization = model.generate(
+         tokenize_inputs, num_beams=4, max_length=150, early_stopping=True
+     )
+     summary_decoded = tokenizer.decode(ids_summarization[0], skip_special_tokens=True)
+     return summary_decoded
+
+
+ def longformer_summarizer(text):
+     """Summarize medium-length inputs with a Longformer encoder-decoder."""
+     tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
+     model = EncoderDecoderModel.from_pretrained(
+         "patrickvonplaten/longformer2roberta-cnn_dailymail-fp16"
+     )
+     inputs = tokenizer(
+         text, return_tensors="pt", padding="longest", truncation=True
+     ).input_ids
+     ids_summarization = model.generate(inputs)
+     summary_decoded = tokenizer.decode(ids_summarization[0], skip_special_tokens=True)
+     return summary_decoded
+
+
+ def longformer_summarizer_long_text(
+     text, max_chunk_length=4000, overlap=200, max_summary_length=1024
+ ):
+     """Summarize very long inputs by splitting them into overlapping chunks
+     and joining the chunk summaries."""
+     tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
+     model = EncoderDecoderModel.from_pretrained(
+         "patrickvonplaten/longformer2roberta-cnn_dailymail-fp16"
+     )
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model = model.to(device)
+     tokens = tokenizer.encode(text)
+     if len(tokens) <= max_chunk_length:
+         inputs = tokenizer(text, return_tensors="pt", padding="longest").input_ids.to(
+             device
+         )
+         summary_ids = model.generate(inputs, max_length=max_summary_length)
+         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+         return summary
+     chunk_summaries = []
+     for i in range(0, len(tokens), max_chunk_length - overlap):
+         chunk_tokens = tokens[i : i + max_chunk_length]
+         # Skip trailing fragments that are too short to summarize usefully.
+         if len(chunk_tokens) < 100:
+             continue
+         chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
+         inputs = tokenizer(
+             chunk_text, return_tensors="pt", padding="longest"
+         ).input_ids.to(device)
+         summary_ids = model.generate(inputs, max_length=max_summary_length // 2)
+         chunk_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+         chunk_summaries.append(chunk_summary)
+     final_summary = " ".join(chunk_summaries)
+     return final_summary
+
+
+ def summarize_text(text):
+     """Pick a summarizer based on the BART token length of the input."""
+     bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
+     input_length = len(bart_tokenizer.encode(text))
+     if input_length < 1024:
+         summary = bart_summarizer(text)
+     elif input_length < 4096:
+         summary = longformer_summarizer(text)
+     else:
+         summary = longformer_summarizer_long_text(text)
+     return summary
+
+
+ def summarize(embeddings, labels, cleaned_files):
+     """Concatenate the documents of each cluster and summarize every cluster;
+     noise documents (label -1) are summarized individually."""
+     no_of_clusters = max(labels) + 1
+     clusters_embeddings = []
+     clusters_text = [""] * no_of_clusters
+     for i in range(no_of_clusters):
+         clusters_embeddings.append(embeddings[labels == i])
+     noise_docs = []
+     for label, text_chunk in zip(labels, cleaned_files):
+         if label != -1:
+             clusters_text[label] += text_chunk
+         else:
+             noise_docs.append(text_chunk)
+     clusters_text.extend(noise_docs)
+     cluster_texts_combined = ["".join(cluster) for cluster in clusters_text]
+     final_summaries = [
+         summarize_text(cluster_text) for cluster_text in cluster_texts_combined
+     ]
+     return final_summaries
+
+
+ def tfidf_plot(all_text):
+     """Bar chart of the top TF-IDF weighted words across all documents."""
+     tokens = word_tokenize(all_text.lower())
+     stop_words = set(stopwords.words("english"))
+     filtered_tokens = [w for w in tokens if w not in stop_words and w.isalnum()]
+     vectorizer = TfidfVectorizer()
+     tfidf_matrix = vectorizer.fit_transform([" ".join(filtered_tokens)])
+     feature_names = vectorizer.get_feature_names_out()
+     tfidf_scores = tfidf_matrix.toarray()[0]
+     top_n = 25
+     top_indices = tfidf_scores.argsort()[-top_n:]
+     top_words = [feature_names[i] for i in top_indices]
+     top_scores = [tfidf_scores[i] for i in top_indices]
+     fig, ax = plt.subplots(figsize=(10, 5))
+     ax.barh(top_words, top_scores, color="skyblue")
+     ax.set_xlabel("TF-IDF Score")
+     ax.set_ylabel("Words")
+     ax.set_title("Top {} Important Words (TF-IDF)".format(top_n))
+     ax.invert_yaxis()
+     return fig
+
+
+ def dendrogram_plot(embeddings, labels):
+     """Hierarchical dendrogram over pairwise cosine distances between documents."""
+     from scipy.spatial.distance import squareform
+
+     similarity_matrix = cosine_similarity(embeddings)
+     distance_matrix = 1 - similarity_matrix
+     # sch.linkage expects a condensed distance matrix, so convert the square
+     # cosine-distance matrix before clustering.
+     condensed_distances = squareform(distance_matrix, checks=False)
+     linkage_matrix = sch.linkage(condensed_distances, method="ward")
+     dendrogram_labels = [
+         f"Doc {i} (Cluster {labels[i]})" if labels[i] != -1 else f"Doc {i} (Noise)"
+         for i in range(len(labels))
+     ]
+     fig, ax = plt.subplots(figsize=(12, 8))
+     sch.dendrogram(
+         linkage_matrix,
+         labels=dendrogram_labels,
+         orientation="right",
+         leaf_font_size=10,
+         ax=ax,
+     )
+     ax.set_title("Hierarchical Dendrogram of Document Clusters", fontsize=14)
+     ax.set_xlabel("Distance", fontsize=12)
+     ax.set_ylabel("Documents", fontsize=12)
+     plt.tight_layout()
+     return fig
+
+
+ def tsne_plot(embeddings, labels):
+     """2-D t-SNE projection of the document embeddings, colored by cluster."""
+     n_samples = len(embeddings)
+     if n_samples < 2:
+         fig, ax = plt.subplots(figsize=(6, 4))
+         ax.text(
+             0.5,
+             0.5,
+             "t-SNE plot is not applicable for a single document.",
+             fontsize=12,
+             ha="center",
+             va="center",
+             wrap=True,
+         )
+         ax.axis("off")
+         return fig
+     # Perplexity must be strictly smaller than the number of samples.
+     perplexity = min(30, n_samples - 1)
+     tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
+     reduced_embeddings = tsne.fit_transform(embeddings)
+     fig, ax = plt.subplots(figsize=(8, 6))
+     scatter = ax.scatter(
+         reduced_embeddings[:, 0],
+         reduced_embeddings[:, 1],
+         c=labels,
+         cmap="viridis",
+         s=50,
+         alpha=0.8,
+     )
+     ax.set_title("t-SNE Visualization of Document Clusters", fontsize=14)
+     ax.set_xlabel("t-SNE Dimension 1", fontsize=12)
+     ax.set_ylabel("t-SNE Dimension 2", fontsize=12)
+     cbar = plt.colorbar(scatter, ax=ax)
+     cbar.set_label("Cluster Labels", fontsize=12)
+     return fig
+
+
+ def wordcloud_plot(all_text):
+     """Render a word cloud and return it as a NumPy image array for gr.Image."""
+     wordcloud = WordCloud(width=800, height=400, background_color="white").generate(
+         all_text
+     )
+     fig, ax = plt.subplots(figsize=(10, 5), facecolor=None)
+     ax.imshow(wordcloud)
+     ax.axis("off")
+     plt.tight_layout(pad=0)
+     buf = io.BytesIO()
+     fig.savefig(buf, format="png")
+     buf.seek(0)
+     img = Image.open(buf)
+     img_array = np.array(img)
+     buf.close()
+     plt.close(fig)
+     return img_array
+
+
+ def summarize_docs(files_text):
+     """Gradio callback: clean the uploaded files, cluster them, and return
+     the summaries together with the four visualizations."""
+     if files_text:
+         cleaned_files = clean_files(files_text)
+         # A single document needs no clustering or comparison plots.
+         if len(cleaned_files) == 1:
+             summary = summarize_text(cleaned_files[0])
+             return (
+                 f"Summary for the uploaded document:\n{summary}",
+                 None,
+                 None,
+                 None,
+                 None,
+             )
+         embeddings = get_embeddings(cleaned_files)
+         if len(embeddings) < 2:
+             return (
+                 "Not enough documents for clustering. Please upload more files.",
+                 None,
+                 None,
+                 None,
+                 None,
+             )
+         labels = clustering_labels(embeddings)
+         summaries = summarize(embeddings, labels, cleaned_files)
+         summary_output = "\n".join(
+             [
+                 f"• Summary for cluster/doc {i+1}:\n{summary}"
+                 for i, summary in enumerate(summaries)
+             ]
+         )
+         all_text = " ".join(cleaned_files)
+         tfidf_fig = tfidf_plot(all_text)  # TF-IDF plot figure
+         dendrogram_fig = dendrogram_plot(embeddings, labels)  # dendrogram figure
+         tsne_fig = tsne_plot(embeddings, labels)  # t-SNE figure
+         wordcloud_fig = wordcloud_plot(all_text)  # word cloud image
+         return summary_output, tfidf_fig, dendrogram_fig, tsne_fig, wordcloud_fig
+     else:
+         return "No files uploaded.", None, None, None, None
+
+
+ import gradio as gr
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# 📰 Multi-Document Summarization")
+
+     with gr.Row():
+         with gr.Column():
+             file_upload = gr.Files(label="Upload Your Files")
+             gr.Markdown(
+                 "### Supported File Types: 📄 `.docx` 📝 `.txt` 🌐 `.html` 📑 `.pdf` 📊 `.csv` 📈 `.xlsx` 🗂 `.json` 🗃 `.xml` 🎞 `.ppt/.pptx`",
+                 elem_id="file-types-info",
+             )
+             summarize_btn = gr.Button("Summarize")
+
+         with gr.Column():
+             summary_output = gr.Textbox(label="• Bullet List of Summaries", lines=10)
+
+     gr.Markdown("## 📊 Visualizations")
+
+     with gr.Row():
+         dendro = gr.Plot(label="Dendrogram")
+         tsne = gr.Plot(label="t-SNE")
+
+     with gr.Row():
+         tfidf = gr.Plot(label="TF-IDF")
+
+     with gr.Row():
+         wordcloud = gr.Image(label="Word Cloud")
+
+     # Output order must match the tuple returned by summarize_docs:
+     # (summary text, TF-IDF, dendrogram, t-SNE, word cloud).
+     summarize_btn.click(
+         summarize_docs,
+         inputs=file_upload,
+         outputs=[summary_output, tfidf, dendro, tsne, wordcloud],
+     )
+
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ transformers
+ sentence-transformers
+ hdbscan
+ nltk
+ scikit-learn
+ matplotlib
+ seaborn
+ plotly
+ wordcloud
+ pandas
+ openpyxl
+ python-docx
+ python-pptx
+ pillow
+ comtypes  # Windows-only, used for legacy .ppt conversion
+ beautifulsoup4
+ pypdf
+ gradio