Spaces:
Runtime error
Runtime error
Qazi-Mudassar-Ilyas
committed on
Commit
•
72ed163
1
Parent(s):
32e4aa1
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import fitz
|
3 |
+
from dotenv import find_dotenv, load_dotenv
|
4 |
+
import gradio as gr
|
5 |
+
from pathlib import Path
|
6 |
+
from langchain_community.document_loaders import PyMuPDFLoader
|
7 |
+
|
8 |
+
from langchain.text_splitter import CharacterTextSplitter
|
9 |
+
from langchain_community.llms import HuggingFaceEndpoint
|
10 |
+
|
11 |
+
from pptx import Presentation
|
12 |
+
|
13 |
+
# Pull variables from a local .env file (if one is found) into the process
# environment before anything reads them.
load_dotenv(find_dotenv())

# Hugging Face access token as found in the environment (None when unset).
# NOTE(review): not referenced by the code visible in this file; presumably the
# endpoint client picks the token up from the environment itself -- confirm.
hf_api = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Shared text-generation client used by predict().
llm = HuggingFaceEndpoint(
    repo_id="Mistralai/Mistral-7B-Instruct-v0.2",
    temperature=0.1,
    max_new_tokens=1000,
)
|
16 |
+
|
17 |
+
def load_file(input_file, max_pages=5):
    """Load a PDF and split its text into chunks for summarization.

    The PDF is read with ``PyMuPDFLoader`` and the resulting documents are
    split with a ``CharacterTextSplitter`` (chunk size 1000, overlap 100).

    To bound the amount of text sent to the LLM, when the PDF has more than
    ``max_pages`` pages only the first ``max_pages`` *chunks* are kept. Note
    that the cut-off is applied to the split chunks, not to the PDF pages
    themselves.

    Args:
        input_file: Path of the PDF file to read.
        max_pages: Page-count threshold and, when exceeded, the number of
            chunks to keep. Defaults to 5.

    Returns:
        A list of split document chunks (objects exposing ``page_content``).
    """
    documents = PyMuPDFLoader(input_file).load()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    pages = text_splitter.split_documents(documents)

    # The PDF is opened a second time only to read its page count; the
    # context manager guarantees the handle is released afterwards.
    with fitz.open(input_file) as pdf_document:
        total_pages = pdf_document.page_count

    if total_pages > max_pages:
        pages = pages[:max_pages]
    return pages
|
31 |
+
|
32 |
+
def predict(text, num_bullets):
    """Ask the LLM for a slide title and a short summary of ``text``.

    Args:
        text: Source text the slide should summarize.
        num_bullets: Maximum number of summary lines requested in the prompt.

    Returns:
        Whatever ``llm.invoke`` returns for the assembled prompt.
    """
    # The instruction and the source text are sent as one single prompt.
    return llm.invoke(
        f"You are an expert in making presentatinos with excellent titles and summarized content in lines. Give a title (max 5 words) and summary containing a maximum of {num_bullets} lines in list format. Do not append bullets to summary lines: {text}"
    )
|
36 |
+
|
37 |
+
def extract_title_and_summary(answer):
    """Parse an LLM answer into a slide title and a summary text.

    The answer is scanned line by line:

    * a line containing ``"Title:"`` sets the title to the text after that
      marker;
    * a line containing ``"Summary:"`` restarts the summary; any text after
      the marker becomes its first line (a bare ``Summary:`` header adds
      nothing, so the summary never starts with an empty line);
    * any other non-empty line is appended to the summary, but only once a
      title has been seen.

    Args:
        answer: Raw text returned by the LLM. ``None`` or an empty string is
            treated as "nothing to parse".

    Returns:
        A ``(title, summary)`` tuple. ``title`` is ``None`` when no title
        line was found; ``summary`` is the collected lines joined with
        newlines (possibly an empty string).
    """
    if not answer:
        return None, ""

    title = None
    summary = []

    for raw_line in answer.strip().split('\n'):
        line = raw_line.strip()
        if "Title:" in line:
            # Take what follows the marker itself, not the first colon on
            # the line, so a prefix such as "Slide 1: Title: X" yields "X".
            title = line.partition("Title:")[2].strip()
        elif "Summary:" in line:
            remainder = line.partition("Summary:")[2].strip()
            summary = [remainder] if remainder else []
        elif title is not None and line:
            summary.append(line)

    return title, '\n'.join(summary)
|
65 |
+
|
66 |
+
def generate_presentation(input_file, num_slides, num_bullets, progress=gr.Progress()):
    """Build a PowerPoint presentation summarizing an uploaded PDF.

    The PDF is split into text chunks, the chunks are grouped so that at most
    ``num_slides`` groups are formed, each group is summarized by the LLM, and
    one "title + content" slide is added per group.

    Args:
        input_file: Path of the uploaded PDF file.
        num_slides: Maximum number of slides to generate.
        num_bullets: Maximum number of summary lines requested per slide.
        progress: Gradio progress tracker updated once per slide.

    Returns:
        Path of the saved ``.pptx`` file (named after the PDF, written to the
        current working directory).

    Raises:
        gr.Error: If no file was provided.
    """
    if input_file is None:
        raise gr.Error("Please upload a PDF file first.")

    pages = load_file(input_file)

    # Text content of every chunk produced from the PDF.
    chunks = [item.page_content for item in pages]

    # Ceiling division: rounding the group size *up* guarantees the number of
    # groups (slides) never exceeds the requested slide count.
    num_slides = max(1, int(num_slides))
    chunks_per_slide = max(1, -(-len(chunks) // num_slides))
    total_slides = -(-len(chunks) // chunks_per_slide)

    prs = Presentation()  # start from an empty presentation

    group_starts = range(0, len(chunks), chunks_per_slide)
    for slide_no, start in enumerate(group_starts, start=1):
        # Report the fraction of slides already finished, labelled with the
        # slide currently being generated.
        progress((slide_no - 1) / total_slides, desc=f"Generating slide: {slide_no}")

        # Merge the chunks that belong to this slide into one text.
        concatenated_chunks = '\n\n'.join(chunks[start:start + chunks_per_slide])

        answer = predict(concatenated_chunks, num_bullets)
        title, summary = extract_title_and_summary(answer)

        # Layout 1 of the default template provides a title placeholder and
        # one content placeholder.
        new_slide = prs.slides.add_slide(prs.slide_layouts[1])
        if title is not None:
            new_slide.shapes.title.text = title
        if summary is not None:
            new_slide.placeholders[1].text = summary

    # Save next to the working directory, named after the source PDF.
    pres_path = f'./{Path(input_file).stem}.pptx'
    prs.save(pres_path)
    return pres_path
|
110 |
+
|
111 |
+
# Declarative Gradio UI: upload a PDF, pick slide/bullet counts, press the
# button, and download the generated presentation.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # PDF2PPTX
        """
    )

    with gr.Column():
        input_file = gr.File(
            label='Upload your PDF file...',
            file_count='single',
            file_types=['.pdf'],
        )
        num_slides = gr.Slider(
            label="Number of slides",
            interactive=True,
            minimum=1,
            maximum=50,
            value=5,
            step=1,
        )
        num_bullets = gr.Slider(
            label="Number of bullets per slides",
            interactive=True,
            minimum=1,
            maximum=10,
            value=5,
            step=1,
        )
        fileuploadbtn = gr.Button("Generate Presentation")
        presentation = gr.File(label="Your Presentation", interactive=False)

    gr.Markdown(
        """
        # Responsible AI Usage
        Your documents uploaded to the system or presentations generated are not saved.
        """
    )

    # Wire the button to the generator; the returned path fills the
    # read-only "Your Presentation" file component.
    fileuploadbtn.click(
        fn=generate_presentation,
        inputs=[input_file, num_slides, num_bullets],
        outputs=[presentation],
    )

if __name__ == "__main__":
    demo.launch()
|
134 |
+
|
135 |
+
|