"""Turn a PDF into slide decks (HTML and Beamer) via a map/reduce LLM pipeline."""

import re

from langchain.globals import set_debug
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker

from chains.map_docs import map_chain
from chains.reduce_docs import reduce_chain
from llm import embedder

set_debug(True)


def load_pdf_url(url):
    """Load the PDF at *url* with ``PyPDFLoader`` and return the loaded documents."""
    loader = PyPDFLoader(url)
    return loader.load()


def semantic_chunking(pages):
    """Split the combined text of *pages* into semantically coherent documents.

    The ``page_content`` of every page is joined with a single space into one
    string, which is then split by a ``SemanticChunker`` built on the
    project's ``embedder`` using the ``'gradient'`` breakpoint threshold type.
    """
    text_splitter = SemanticChunker(embedder, breakpoint_threshold_type='gradient')
    full_text = ' '.join(page.page_content for page in pages)
    return text_splitter.create_documents([full_text])


def markdown_to_json(markdown):
    """Parse slide markdown into a list of slide dictionaries.

    A slide is a ``### Slide <number>: <title>`` heading line immediately
    followed by one or more ``- <text>`` bullet lines. Text that does not
    match this shape is ignored.

    Returns:
        A list of dicts with keys ``slide_number`` (int), ``slide_title``
        (str) and ``points`` (list of ``{'content': str, 'sources': []}``).
    """
    # A bullet may be terminated by a newline OR by the end of the input, so
    # the last bullet is kept even when the markdown lacks a trailing newline.
    slide_pattern = r'### Slide (\d+): (.+)\n((?:- .+(?:\n|$))+)'
    slides = []
    for number, title, bullet_block in re.findall(slide_pattern, markdown):
        points = [
            {'content': line.lstrip('-').strip(), 'sources': []}
            for line in bullet_block.split('\n')
            if line.strip()
        ]
        slides.append(
            {'slide_number': int(number), 'slide_title': title, 'points': points}
        )
    return slides


def json_to_html(json):
    """Render the slide dictionaries in *json* into a single text/HTML string.

    Each slide is formatted with its number, title and bullet points, and the
    per-slide fragments are concatenated between a leading and trailing
    wrapper string.

    NOTE(review): the template and wrapper strings contain no tags; it looks
    like the HTML markup was lost somewhere upstream -- confirm against the
    intended template. The literal text is preserved here unchanged.
    NOTE(review): slide text is interpolated without HTML escaping -- confirm
    whether the content can be trusted.
    """
    template = '\nSlide {slide_no}\n{slide_title}\n{points}\n'
    parts = ['\n']
    for slide in json:
        # Join the rendered bullets; previously the list was overwritten with
        # an empty string, so slides were emitted without any points.
        points = ''.join(
            f'\n  • {point["content"]}\n  • ' for point in slide['points']
        )
        parts.append(
            template.format(
                slide_no=slide['slide_number'],
                slide_title=slide['slide_title'],
                points=points,
            )
        )
    parts.append('\n    ')
    return ''.join(parts)


def json_to_beamer(json):
    """Render the slide dictionaries in *json* as Beamer frames.

    The result is wrapped in a Markdown ``tex`` code fence; every slide
    becomes one ``frame`` whose points are the items of an ``itemize`` list.

    NOTE(review): titles and points are inserted without LaTeX escaping.
    """
    lines = ['```tex\n']
    for slide in json:
        lines.append(f'\\begin{{frame}}{{{slide["slide_title"]}}}\n')
        lines.append(' \\begin{itemize}\n')
        lines.extend(
            f' \\item {point["content"]}\n' for point in slide['points']
        )
        lines.append(' \\end{itemize}\n')
        lines.append('\\end{frame}\n')
    lines.append('```\n')
    return ''.join(lines)


async def generate(url):
    """Build HTML and Beamer slide decks from the PDF at *url*.

    Pipeline: load the PDF, chunk it semantically, run ``map_chain`` over all
    chunks concurrently, combine the results with ``reduce_chain``, parse the
    reduced markdown into slides, and render both output formats.

    Returns:
        A ``(html, beamer)`` tuple of strings.
    """
    # NOTE(review): loading, chunking and the reduce step are synchronous
    # calls inside this coroutine; only the map step is awaited.
    pages = load_pdf_url(url)
    docs = semantic_chunking(pages)
    map_res = await map_chain.abatch(
        [{'doc': doc.page_content} for doc in docs],
        config={'max_concurrency': len(docs)},
    )
    reduce_res = reduce_chain.invoke({'docs': map_res})
    json_res = markdown_to_json(reduce_res.content)
    html_res = json_to_html(json_res)
    beamer_res = json_to_beamer(json_res)
    return html_res, beamer_res