Spaces:
Sleeping
Sleeping
File size: 2,812 Bytes
bca3a10 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import re
from langchain.globals import set_debug
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from chains.map_docs import map_chain
from chains.reduce_docs import reduce_chain
from llm import embedder
# Module-level side effect: switches on LangChain's global debug flag as soon
# as this module is imported, so it affects every chain in the process.
# NOTE(review): presumably intended for development only — confirm before deploying.
set_debug(True)
def load_pdf_url(url):
    """Load the PDF referenced by ``url`` and return its pages.

    Args:
        url: Location handed straight to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader(url).load()`` produces (the loaded pages).
    """
    return PyPDFLoader(url).load()
def semantic_chunking(pages):
    """Split the combined text of ``pages`` into chunks with ``SemanticChunker``.

    Args:
        pages: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The documents returned by ``SemanticChunker.create_documents`` for the
        single string made of all page texts joined by one space.
    """
    chunker = SemanticChunker(embedder, breakpoint_threshold_type='gradient')
    # The whole PDF is chunked as one text so splits can cross page boundaries.
    full_text = ' '.join(page.page_content for page in pages)
    return chunker.create_documents([full_text])
def markdown_to_json(markdown):
    """Parse slide-formatted markdown into a list of slide dictionaries.

    A slide is a heading of the form ``### Slide <n>: <title>`` followed
    directly by one or more bullet lines starting with ``- ``.

    Args:
        markdown: Markdown text to parse.

    Returns:
        A list of dicts, one per slide, with keys ``slide_number`` (int),
        ``slide_title`` (str) and ``points`` (list of
        ``{'content': str, 'sources': []}``). Text that does not match the
        slide layout is ignored; an input with no slides yields ``[]``.
    """
    # Each bullet may end with a newline OR the end of the input, so the last
    # bullet is kept when the markdown has no trailing newline.
    slide_pattern = r'### Slide (\d+): (.+)\n((?:- .+(?:\n|$))+)'

    slides = []
    for number, title, bullet_block in re.findall(slide_pattern, markdown):
        points = [
            {'content': line.lstrip('-').strip(), 'sources': []}
            for line in bullet_block.split('\n')
            if line.strip()
        ]
        slides.append({
            'slide_number': int(number),
            'slide_title': title,
            'points': points,
        })
    return slides
def json_to_html(json):
    """Render slide dictionaries as an HTML fragment.

    Args:
        json: Iterable of slide dicts with keys ``slide_number``,
            ``slide_title`` and ``points`` (each point a dict with a
            ``content`` entry), as produced by ``markdown_to_json``.

    Returns:
        A string containing one outer flex ``<div>`` that wraps one card per
        slide: a "Slide N" badge, the title, and a ``<ul>`` of the points.
        An empty input yields just the empty outer ``<div>``.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    template = (
        '\n'
        '<div style="display: flex; gap: 4px 0px; flex-direction: column;">\n'
        '<div style="align-self: flex-start; font-size: 12px; font-weight: bold; '
        'padding: 2px 6px; background-color: #55555555; border-radius: 4px">'
        'Slide {slide_no}</div>\n'
        '<div style="font-size: 16px; font-weight: bold; margin-left: 4px">{slide_title}</div>\n'
        '<div style="margin-left: 8px">\n'
        '{points}\n'
        '</div>\n'
        '</div>\n'
    )

    def _text(value):
        # Titles and points are generated text; escape &, < and > so they are
        # shown literally instead of being parsed as markup. Quotes are left
        # alone because the text never lands inside an attribute.
        return escape(str(value), quote=False)

    parts = ['<div style="display: flex; gap: 24px 0px; flex-direction: column;">']
    for slide in json:
        items = ''.join(f'<li>{_text(point["content"])}</li>' for point in slide['points'])
        parts.append(
            template.format(
                slide_no=slide['slide_number'],
                slide_title=_text(slide['slide_title']),
                points=f'<ul>{items}</ul>',
            )
        )
    parts.append('</div>')
    return ''.join(parts)
# Characters that LaTeX treats specially, mapped to their literal-text forms.
_LATEX_ESCAPES = str.maketrans({
    '\\': r'\textbackslash{}',
    '&': r'\&',
    '%': r'\%',
    '$': r'\$',
    '#': r'\#',
    '_': r'\_',
    '{': r'\{',
    '}': r'\}',
    '~': r'\textasciitilde{}',
    '^': r'\textasciicircum{}',
})


def _escape_latex(text):
    """Return ``text`` with LaTeX special characters escaped for plain-text use."""
    # A single translate pass avoids re-escaping the braces/backslashes that
    # the replacements themselves introduce.
    return str(text).translate(_LATEX_ESCAPES)


def json_to_beamer(json):
    """Render slide dictionaries as Beamer frames inside a ``tex`` code fence.

    Titles and points are treated as plain text: LaTeX special characters
    (``% & $ # _ { } ~ ^ \\``) are escaped so that, for example, a ``%`` does
    not comment out the rest of a line and an ``&`` does not break compilation.

    Args:
        json: Iterable of slide dicts with keys ``slide_title`` and ``points``
            (each point a dict with a ``content`` entry).

    Returns:
        A string that starts with ```` ```tex ```` and ends with a closing
        fence, containing one ``frame`` with an ``itemize`` list per slide.
    """
    lines = ['```tex\n']
    for slide in json:
        lines.append(f'\\begin{{frame}}{{{_escape_latex(slide["slide_title"])}}}\n')
        lines.append(' \\begin{itemize}\n')
        lines.extend(
            f' \\item {_escape_latex(point["content"])}\n' for point in slide['points']
        )
        lines.append(' \\end{itemize}\n')
        lines.append('\\end{frame}\n')
    lines.append('```\n')
    return ''.join(lines)
async def generate(url):
    """Turn the PDF at ``url`` into slide output in HTML and Beamer form.

    Steps: load the PDF, chunk its text, run ``map_chain`` over every chunk
    concurrently, feed the mapped results to ``reduce_chain``, parse the
    resulting markdown into slides, and render those slides twice.

    Args:
        url: Location of the PDF, passed to ``load_pdf_url``.

    Returns:
        A ``(html, beamer)`` tuple of strings from ``json_to_html`` and
        ``json_to_beamer``.
    """
    pages = load_pdf_url(url)
    docs = semantic_chunking(pages)

    map_inputs = [{'doc': doc.page_content} for doc in docs]
    # One concurrent slot per chunk, i.e. no throttling of the map step.
    map_res = await map_chain.abatch(map_inputs, config={'max_concurrency': len(docs)})

    # Await the async entry point so the reduce step does not block the event
    # loop the way a synchronous ``invoke`` inside a coroutine would.
    # NOTE(review): assumes reduce_chain is a LangChain Runnable like map_chain — confirm.
    reduce_res = await reduce_chain.ainvoke({'docs': map_res})

    json_res = markdown_to_json(reduce_res.content)
    html_res = json_to_html(json_res)
    beamer_res = json_to_beamer(json_res)
    return html_res, beamer_res
|