Spaces:
Sleeping
Sleeping
import html
import re

from langchain.globals import set_debug
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker

from chains.map_docs import map_chain
from chains.reduce_docs import reduce_chain
from llm import embedder
# NOTE(review): enables global LangChain debug tracing for every chain call
# in the process — verbose and potentially sensitive; confirm this is
# intended outside local development.
set_debug(True)
def load_pdf_url(url):
    """Fetch the PDF at *url* and parse it into a list of per-page documents."""
    return PyPDFLoader(url).load()
def semantic_chunking(pages):
    """Concatenate all page texts and split them into semantically coherent chunks.

    Uses the shared ``embedder`` with gradient breakpoint detection; returns
    the resulting list of documents.
    """
    full_text = ' '.join(page.page_content for page in pages)
    splitter = SemanticChunker(embedder, breakpoint_threshold_type='gradient')
    return splitter.create_documents([full_text])
def markdown_to_json(markdown):
    """Parse the reduce-chain's markdown outline into a list of slide dicts.

    Expected shape per slide in *markdown*:

        ### Slide <n>: <title>
        - <bullet>
        - <bullet>

    Returns a list of
    ``{'slide_number': int, 'slide_title': str,
       'points': [{'content': str, 'sources': []}, ...]}``.
    Text that does not match the pattern is silently skipped (best-effort,
    matching the surrounding pipeline's tolerance of LLM formatting drift).
    """
    # (?:\n|$) accepts a final bullet with no trailing newline, which the
    # previous pattern dropped; the bullet group is non-capturing so
    # findall yields exactly (number, title, bullet_block) triples.
    slide_pattern = re.compile(r'### Slide (\d+): (.+)\n((?:- .+(?:\n|$))+)')
    slides = []
    for number, title, bullet_block in slide_pattern.findall(markdown):
        # Anchored per-line extraction: strips exactly the "- " marker,
        # unlike lstrip('-') which ate every leading dash of the content.
        points = [
            {'content': text.strip(), 'sources': []}
            for text in re.findall(r'^- *(.+)$', bullet_block, re.MULTILINE)
        ]
        slides.append({
            'slide_number': int(number),
            'slide_title': title,
            'points': points,
        })
    return slides
def json_to_html(json):
    """Render the slide dicts from ``markdown_to_json`` as a styled HTML string.

    *json* is the list of slide dicts. (The parameter name shadows the
    stdlib module; kept for backward compatibility with keyword callers.)

    Titles and bullet contents originate from LLM output, so they are run
    through ``html.escape`` to prevent markup injection and broken rendering.
    """
    template = """
    <div style="display: flex; gap: 4px 0px; flex-direction: column;">
        <div style="align-self: flex-start; font-size: 12px; font-weight: bold; padding: 2px 6px; background-color: #55555555; border-radius: 4px">Slide {slide_no}</div>
        <div style="font-size: 16px; font-weight: bold; margin-left: 4px">{slide_title}</div>
        <div style="margin-left: 8px">
            {points}
        </div>
    </div>
    """
    # Collect fragments and join once instead of repeated += concatenation.
    parts = ['<div style="display: flex; gap: 24px 0px; flex-direction: column;">']
    for slide in json:
        items = ''.join(
            f'<li>{html.escape(point["content"])}</li>' for point in slide['points']
        )
        parts.append(template.format(
            slide_no=slide['slide_number'],
            slide_title=html.escape(slide['slide_title']),
            points=f'<ul>{items}</ul>',
        ))
    parts.append('</div>')
    return ''.join(parts)
# LaTeX-active characters and their escaped forms; backslash must map to a
# macro (not '\\\\') because '\\' starts a new line in LaTeX.
_LATEX_SPECIALS = {
    '\\': r'\textbackslash{}',
    '&': r'\&',
    '%': r'\%',
    '$': r'\$',
    '#': r'\#',
    '_': r'\_',
    '{': r'\{',
    '}': r'\}',
    '~': r'\textasciitilde{}',
    '^': r'\textasciicircum{}',
}


def _latex_escape(text):
    """Escape LaTeX special characters so LLM text cannot break a frame."""
    return ''.join(_LATEX_SPECIALS.get(ch, ch) for ch in text)


def json_to_beamer(json):
    """Render the slide dicts as a fenced ```tex block of Beamer frames.

    *json* is the list of slide dicts from ``markdown_to_json``. Each slide
    becomes one ``frame`` with an ``itemize`` list of its points. Titles and
    bullet contents are escaped — previously a literal ``%`` in LLM output
    commented out the rest of the LaTeX line and ``&``/``_``/``#`` broke
    compilation outright.
    """
    lines = ['```tex']
    for slide in json:
        lines.append(f'\\begin{{frame}}{{{_latex_escape(slide["slide_title"])}}}')
        lines.append('  \\begin{itemize}')
        for point in slide['points']:
            lines.append(f'    \\item {_latex_escape(point["content"])}')
        lines.append('  \\end{itemize}')
        lines.append('\\end{frame}')
    lines.append('```')
    return '\n'.join(lines) + '\n'
async def generate(url):
    """Run the full pipeline: PDF URL -> map/reduce outline -> (HTML, Beamer).

    Loads and chunks the PDF, maps each chunk through ``map_chain``
    concurrently (one slot per chunk), reduces the mapped results into a
    single markdown outline, and returns the rendered HTML and Beamer
    strings as a pair.

    NOTE(review): assumes the PDF yields at least one chunk —
    ``max_concurrency`` of 0 on an empty document is untested upstream.
    """
    chunks = semantic_chunking(load_pdf_url(url))
    mapped = await map_chain.abatch(
        [{'doc': chunk.page_content} for chunk in chunks],
        config={'max_concurrency': len(chunks)},
    )
    outline = reduce_chain.invoke({'docs': mapped})
    slides = markdown_to_json(outline.content)
    return json_to_html(slides), json_to_beamer(slides)