"""PDF2PPTX: convert an uploaded PDF into a PowerPoint deck, one LLM-summarized slide per group of text chunks."""

import os
from pathlib import Path

import fitz
import gradio as gr
from dotenv import find_dotenv, load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.llms import HuggingFaceEndpoint
from pptx import Presentation

# Load the HuggingFace token from a .env file; HuggingFaceEndpoint reads
# HUGGINGFACEHUB_API_TOKEN from the environment to authenticate.
_ = load_dotenv(find_dotenv())
hf_api = os.getenv("HUGGINGFACEHUB_API_TOKEN")

llm = HuggingFaceEndpoint(
    repo_id="Mistralai/Mistral-7B-Instruct-v0.2",
    temperature=0.1,
    max_new_tokens=1000,
)


def load_file(input_file):
    """Load a PDF and split its text into ~1000-character chunks.

    Args:
        input_file: Filesystem path to the PDF.

    Returns:
        List of LangChain Document chunks; truncated to the first 5 when
        the PDF has more than 5 pages.
    """
    loader = PyMuPDFLoader(input_file)
    documents = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    pages = text_splitter.split_documents(documents)
    # FIX: the original re-opened the PDF without ever closing it (resource
    # leak) and created an unused fitz writer object; use a context manager
    # and drop the dead object.
    with fitz.open(input_file) as pdf_document:
        total_pages = pdf_document.page_count
    # NOTE(review): the condition checks the PDF *page* count while the
    # slice truncates the split *chunks* — presumably intended to bound the
    # number of LLM calls; confirm which unit was meant.
    if total_pages > 5:
        pages = pages[:5]
    return pages


def predict(text, num_bullets):
    """Ask the LLM for a slide title and a bullet-style summary of *text*.

    Args:
        text: Source text to summarize.
        num_bullets: Maximum number of summary lines to request.

    Returns:
        The raw LLM completion (expected to contain "Title:" / "Summary:"
        markers, parsed later by extract_title_and_summary).
    """
    # FIX: corrected the "presentatinos" typo in the prompt.
    prompt = (
        "You are an expert in making presentations with excellent titles and "
        "summarized content in lines. Give a title (max 5 words) and summary "
        f"containing a maximum of {num_bullets} lines in list format.\n"
        f"Do not append bullets to summary lines: {text}"
    )
    return llm.invoke(prompt)


def extract_title_and_summary(answer):
    """Parse an LLM answer into a (title, summary) pair.

    Scans for a "Title:" line and a "Summary:" line; any further non-empty
    line seen after the title is treated as a continuation of the summary.

    Args:
        answer: Raw LLM completion text.

    Returns:
        Tuple (title, summary): title may be None if no "Title:" line was
        found; summary is a newline-joined string (possibly empty).
    """
    title = None
    summary_lines = []
    for line in answer.strip().split('\n'):
        if "Title:" in line:
            title = line.split(":", 1)[1].strip()
        elif "Summary:" in line:
            # Restart the summary from the text after the marker.
            summary_lines = [line.split(":", 1)[1].strip()]
        elif title is not None and line.strip() != "":
            # Continuation line belonging to the summary.
            summary_lines.append(line.strip())
    return title, '\n'.join(summary_lines)


def generate_presentation(input_file, num_slides, num_bullets, progress=gr.Progress()):
    """Build a .pptx from a PDF: one LLM-summarized slide per chunk group.

    Args:
        input_file: Path of the uploaded PDF.
        num_slides: Desired number of slides (upper bound).
        num_bullets: Maximum bullets per slide, forwarded to the prompt.
        progress: Gradio progress tracker injected by the UI.

    Returns:
        Path of the saved .pptx file (same stem as the input PDF).
    """
    pages = load_file(input_file)
    # Chunks merged into each slide; at least 1 so short documents still
    # produce slides (replaces the int()-then-zero-check of the original).
    chunks_per_slide = max(1, len(pages) // num_slides)
    # Extract the raw text from the Document chunks.
    chunks = [item.page_content for item in pages]
    prs = Presentation()  # start from an empty presentation
    slide_no = 0
    for i in range(0, len(chunks), chunks_per_slide):
        slide_no += 1
        # FIX: the original reported (chunk_index + 1) / num_slides, which
        # overshoots 100% whenever chunks_per_slide > 1; report the actual
        # slide count instead.
        progress(slide_no / num_slides, desc=f"Generating slide: {slide_no}")
        # Concatenate this slide's chunks and summarize them with the LLM.
        concatenated_chunks = '\n\n'.join(chunks[i:i + chunks_per_slide])
        answer = predict(concatenated_chunks, num_bullets)
        title, summary = extract_title_and_summary(answer)
        # Layout 1 is the "Title and Content" layout of the default template.
        new_slide = prs.slides.add_slide(prs.slide_layouts[1])
        if title is not None:
            new_slide.shapes.title.text = title
        if summary is not None:
            new_slide.placeholders[1].text = summary
    # Save next to the working directory, named after the source PDF.
    pres_path = f'./{Path(input_file).stem}.pptx'
    prs.save(pres_path)
    return pres_path


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # PDF2PPTX
        """
    )
    with gr.Column():
        input_file = gr.File(
            label='Upload your PDF file...',
            file_count='single',
            file_types=['.pdf'],
        )
        num_slides = gr.Slider(
            label="Number of slides",
            interactive=True,
            minimum=1,
            maximum=50,
            value=5,
            step=1,
        )
        num_bullets = gr.Slider(
            label="Number of bullets per slides",
            interactive=True,
            minimum=1,
            maximum=10,
            value=5,
            step=1,
        )
        fileuploadbtn = gr.Button("Generate Presentation")
        presentation = gr.File(label="Your Presentation", interactive=False)
        gr.Markdown(
            """
            # Responsible AI Usage
            Your documents uploaded to the system or presentations generated are not saved.
            """
        )
    fileuploadbtn.click(
        fn=generate_presentation,
        inputs=[input_file, num_slides, num_bullets],
        outputs=[presentation],
    )

if __name__ == "__main__":
    demo.launch()