# PDF2PPTX / app.py
# Qazi-Mudassar-Ilyas's picture
# Create app.py
# 72ed163 verified
import os
import fitz
from dotenv import find_dotenv, load_dotenv
import gradio as gr
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.llms import HuggingFaceEndpoint
from pptx import Presentation
# Read environment variables from the nearest .env file before anything
# looks them up.
_ = load_dotenv(find_dotenv())

# NOTE(review): hf_api is assigned here but not referenced anywhere else in
# the visible code; presumably the endpoint client picks the token up from
# the environment on its own -- confirm before removing.
hf_api = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Single shared text-generation client used for every slide request.
llm = HuggingFaceEndpoint(
    repo_id="Mistralai/Mistral-7B-Instruct-v0.2",
    temperature=0.1,
    max_new_tokens=1000,
)
def load_file(input_file, max_pages=5):
    """Load a PDF and split the text of its leading pages into chunks.

    Args:
        input_file: Path of the PDF to read; handed to ``PyMuPDFLoader``
            unchanged.
        max_pages: How many leading pages to keep before splitting
            (default 5). ``None`` keeps every page.

    Returns:
        The list returned by ``CharacterTextSplitter.split_documents`` for
        the kept pages (chunk_size=1000, chunk_overlap=100).
    """
    documents = PyMuPDFLoader(input_file).load()

    # Cap the input *before* splitting. The previous code truncated the list
    # of chunks instead, so a long PDF ended up with fewer chunks than a
    # short one, and it opened two fitz handles that were never closed just
    # to read the page count.
    # NOTE(review): this relies on the loader returning one document per PDF
    # page, in page order (its documented default) -- confirm if the loader
    # is ever configured differently.
    documents = documents[:max_pages]

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return text_splitter.split_documents(documents)
def predict(text, num_bullets):
    """Ask the shared LLM for a slide title and summary of ``text``.

    Args:
        text: Source text the slide should summarize; appended verbatim to
            the end of the prompt.
        num_bullets: Maximum number of summary lines to request.

    Returns:
        Whatever ``llm.invoke`` returns for the prompt.
    """
    # The prompt is model input, so spelling matters: "presentations" was
    # previously misspelled. The rest of the wording is unchanged.
    prompt = (
        "You are an expert in making presentations with excellent titles and summarized content in lines. "
        f"Give a title (max 5 words) and summary containing a maximum of {num_bullets} lines in list format. "
        f"Do not append bullets to summary lines: {text}"
    )
    return llm.invoke(prompt)
def extract_title_and_summary(answer):
    """Pull the title and the summary lines out of an LLM answer.

    The answer is scanned line by line:

    * the first line containing ``"Title:"`` supplies the title (the text
      after that marker);
    * a line containing ``"Summary:"`` restarts the summary, seeding it with
      any text that follows the marker on the same line;
    * every other non-blank line seen after the title is appended to the
      summary.

    Args:
        answer: Raw text returned by the model. ``None`` or an empty string
            is treated as "nothing found".

    Returns:
        A ``(title, summary)`` tuple. ``title`` is ``None`` when no
        ``"Title:"`` line exists; ``summary`` is always a string with one
        summary line per text line (possibly empty).
    """
    if not answer:
        return None, ""

    title = None
    summary_lines = []

    for raw_line in answer.strip().splitlines():
        line = raw_line.strip()
        if not line:
            continue

        if title is None and "Title:" in line:
            # Split on the marker itself, not on the first colon, so a colon
            # earlier in the line cannot leak "Title:" into the result.
            title = line.partition("Title:")[2].strip()
        elif "Summary:" in line:
            # A bare "Summary:" header must not contribute an empty entry,
            # otherwise the joined summary starts with a blank line.
            inline_text = line.partition("Summary:")[2].strip()
            summary_lines = [inline_text] if inline_text else []
        elif title is not None:
            # Only the first "Title:" counts; later lines that happen to
            # contain the marker are ordinary summary content.
            summary_lines.append(line)

    return title, "\n".join(summary_lines)
def _group_chunks(chunks, num_groups):
    """Split ``chunks`` into at most ``num_groups`` contiguous, near-equal groups."""
    if not chunks:
        return []
    # Never ask for more groups than there are chunks, and never fewer than one.
    num_groups = max(1, min(int(num_groups), len(chunks)))
    base_size, remainder = divmod(len(chunks), num_groups)
    groups = []
    start = 0
    for group_index in range(num_groups):
        # The first `remainder` groups each absorb one extra chunk.
        size = base_size + (1 if group_index < remainder else 0)
        groups.append(chunks[start:start + size])
        start += size
    return groups


def generate_presentation(input_file, num_slides, num_bullets, progress=gr.Progress()):
    """Build a .pptx deck summarizing a PDF and return the path of the file.

    The PDF is loaded and chunked with ``load_file``, the chunks are grouped
    so that each group feeds one slide, and for every group the LLM is asked
    (via ``predict``) for a title and summary that are written to a new
    slide.

    Args:
        input_file: Path of the uploaded PDF.
        num_slides: Requested number of slides. The deck has exactly this
            many slides when enough chunks exist, otherwise one slide per
            chunk.
        num_bullets: Maximum number of summary lines requested per slide.
        progress: Gradio progress tracker updated once per slide.

    Returns:
        Path of the saved presentation, ``./<pdf stem>.pptx``.

    Raises:
        gr.Error: If no file was supplied.
    """
    if input_file is None:
        raise gr.Error("Please upload a PDF file first.")

    pages = load_file(input_file)
    chunks = [item.page_content for item in pages]

    # Group by slide up front. Flooring chunks-per-slide, as was done before,
    # yields more slides than requested whenever the division is not exact.
    slide_groups = _group_chunks(chunks, num_slides)
    total_slides = len(slide_groups)

    prs = Presentation()  # start from an empty presentation
    for slide_number, group_of_chunks in enumerate(slide_groups, start=1):
        # Report progress in slides (not chunk offsets) so the label counts
        # 1, 2, 3, ... and the fraction stays within [0, 1).
        progress((slide_number - 1) / total_slides, desc=f"Generating slide: {slide_number}")

        concatenated_chunks = "\n\n".join(group_of_chunks)
        answer = predict(concatenated_chunks, num_bullets)
        title, summary = extract_title_and_summary(answer)

        # Layout 1 is used for its title shape plus the placeholder at index 1.
        new_slide = prs.slides.add_slide(prs.slide_layouts[1])
        if title is not None:
            new_slide.shapes.title.text = title
        if summary is not None:
            new_slide.placeholders[1].text = summary

    # Save next to the working directory, named after the uploaded PDF.
    pres_path = f"./{Path(input_file).stem}.pptx"
    prs.save(pres_path)
    return pres_path
# Gradio UI: upload a PDF, choose slide/bullet counts, download the deck.
with gr.Blocks() as demo:
    # Page heading.
    gr.Markdown(
        """
# PDF2PPTX
"""
    )

    with gr.Column():
        input_file = gr.File(
            label='Upload your PDF file...',
            file_count='single',
            file_types=['.pdf'],
        )
        num_slides = gr.Slider(
            label="Number of slides",
            interactive=True,
            minimum=1,
            maximum=50,
            value=5,
            step=1,
        )
        num_bullets = gr.Slider(
            label="Number of bullets per slides",
            interactive=True,
            minimum=1,
            maximum=10,
            value=5,
            step=1,
        )
        fileuploadbtn = gr.Button("Generate Presentation")
        presentation = gr.File(label="Your Presentation", interactive=False)

    # Usage notice shown below the controls.
    gr.Markdown(
        """
# Responsible AI Usage
Your documents uploaded to the system or presentations generated are not saved.
"""
    )

    # Clicking the button runs the generator and feeds its returned path to
    # the output file component.
    fileuploadbtn.click(
        fn=generate_presentation,
        inputs=[input_file, num_slides, num_bullets],
        outputs=[presentation],
    )

if __name__ == "__main__":
    demo.launch()