tykimseoul committed
Commit bca3a10
1 Parent(s): 2759b95

Upload folder using huggingface_hub

.github/workflows/sync_space.yml ADDED
@@ -0,0 +1,28 @@
+name: Run Python script
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: "3.9"
+
+      - name: Install Gradio
+        run: python -m pip install gradio
+
+      - name: Log in to Hugging Face
+        run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
+
+      - name: Deploy to Spaces
+        run: gradio deploy
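
The login step above shells out to a one-line Python snippet. Expanded for readability, it is equivalent to the following sketch — the real token is injected from the repository's `hf_token` secret, and the literal below is only a placeholder:

```python
# Equivalent of the workflow's "Log in to Hugging Face" step, expanded.
# The token value comes from the `hf_token` repository secret at CI time;
# this literal is a placeholder, not a working credential.
import huggingface_hub

huggingface_hub.login(token='hf_xxxxxxxxxxxxxxxx')
```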
.gitignore ADDED
@@ -0,0 +1,2 @@
+venv/share
+.env
README.md CHANGED
@@ -1,11 +1,11 @@
 ---
 title: PaperDeck
-emoji: 🔥
-colorFrom: purple
-colorTo: indigo
+app_file: main.py
+emoji: 😻
+colorFrom: pink
+colorTo: green
 sdk: gradio
 sdk_version: 4.37.2
-app_file: app.py
 pinned: false
 ---
 
chains/map_docs.py ADDED
@@ -0,0 +1,19 @@
+from langchain_core.prompts import PromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+
+from llm import gpt3
+
+map_template = """You are an expert in technical papers and journals.
+You are tasked with summarizing the main points of the following text.
+The following is the text you need to summarize:
+{doc}
+Based on this text, provide a summary of the main points.
+
+RULES:
+- Organize the points in markdown format.
+
+Helpful Answer:
+"""
+
+map_prompt = PromptTemplate.from_template(map_template)
+map_chain = {'doc': RunnablePassthrough()} | map_prompt | gpt3
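
Because `map_chain` wraps its raw input into a `{'doc': ...}` dict via `RunnablePassthrough`, it can be invoked directly with a text chunk. A minimal usage sketch (assumes `OPENAI_KEY` is set in the environment, as `llm.py` expects; the sample text is illustrative):

```python
from chains.map_docs import map_chain

# Invoke on a single chunk of paper text; the result is an AIMessage
# whose .content holds the markdown summary produced by gpt3.
chunk = 'Self-attention relates every token in a sequence to every other token...'
summary = map_chain.invoke(chunk)
print(summary.content)
```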
chains/reduce_docs.py ADDED
@@ -0,0 +1,36 @@
+from langchain_core.prompts import PromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+
+from llm import gpt3
+
+
+def concat_summaries(docs):
+    return '\n\n'.join(doc.content for doc in docs['docs'])
+
+
+# Reduce
+reduce_template = """The following is a set of summaries of a technical paper:
+{docs}
+
+Take these and distill them into a final, consolidated summary of the main points.
+
+RULES:
+- The summary should read as if you are presenting the paper in a seminar.
+- The outline should include the common sections of a technical seminar.
+- Organize the points in PowerPoint slide format.
+- Use markdown to format the text.
+- Each point may be technical.
+- You may have as many points as you need.
+
+Each slide should follow this format:
+### Slide 2: Slide title
+- point 1
+- point 2
+
+Helpful Answer:
+"""
+
+reduce_prompt = PromptTemplate.from_template(reduce_template)
+
+# Run chain
+reduce_chain = {'docs': RunnablePassthrough() | concat_summaries} | reduce_prompt | gpt3
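
`reduce_chain` expects a dict with a `docs` key holding objects that expose `.content` — exactly the shape of the `AIMessage` list that `map_chain.abatch` returns in `generate.py`. A minimal sketch with hand-built messages (again assumes `OPENAI_KEY` is set):

```python
from langchain_core.messages import AIMessage

from chains.reduce_docs import reduce_chain

# Stand-ins for the per-chunk summaries normally produced by map_chain.
summaries = [AIMessage(content='- introduces the model'), AIMessage(content='- reports benchmark results')]
final = reduce_chain.invoke({'docs': summaries})
print(final.content)  # consolidated, slide-formatted summary
```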
generate.py ADDED
@@ -0,0 +1,78 @@
+import re
+
+from langchain.globals import set_debug
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_experimental.text_splitter import SemanticChunker
+
+from chains.map_docs import map_chain
+from chains.reduce_docs import reduce_chain
+from llm import embedder
+
+set_debug(True)
+
+
+def load_pdf_url(url):
+    loader = PyPDFLoader(url)
+    pages = loader.load()
+    return pages
+
+
+def semantic_chunking(pages):
+    text_splitter = SemanticChunker(embedder, breakpoint_threshold_type='gradient')
+    docs = text_splitter.create_documents([' '.join([page.page_content for page in pages])])
+    return docs
+
+
+def markdown_to_json(markdown):
+    regex_pattern = r'### Slide (\d+): (.+)\n((?:- .+\n?)+)'  # slide header followed by one or more bullet lines; final newline optional
+    matches = re.findall(regex_pattern, markdown)
+    slides = []
+    for match in matches:
+        slide = {'slide_number': int(match[0]), 'slide_title': match[1], 'points': [{'content': point.lstrip('-').strip(), 'sources': []} for point in match[2].split('\n') if point.strip()]}
+        slides.append(slide)
+    return slides
+
+
+def json_to_html(slides):
+    template = """
+    <div style="display: flex; gap: 4px 0px; flex-direction: column;">
+        <div style="align-self: flex-start; font-size: 12px; font-weight: bold; padding: 2px 6px; background-color: #55555555; border-radius: 4px">Slide {slide_no}</div>
+        <div style="font-size: 16px; font-weight: bold; margin-left: 4px">{slide_title}</div>
+        <div style="margin-left: 8px">
+            {points}
+        </div>
+    </div>
+    """
+    html = '<div style="display: flex; gap: 24px 0px; flex-direction: column;">'
+    for slide in slides:
+        points = [f'<li>{point["content"]}</li>' for point in slide['points']]
+        points = '<ul>' + ''.join(points) + '</ul>'
+        slide_html = template.format(slide_no=slide['slide_number'], slide_title=slide['slide_title'], points=points)
+        html += slide_html
+    html += '</div>'
+    return html
+
+
+def json_to_beamer(slides):
+    beamer = ''
+    beamer += '```tex\n'
+    for slide in slides:
+        beamer += f'\\begin{{frame}}{{{slide["slide_title"]}}}\n'
+        beamer += '    \\begin{itemize}\n'
+        for point in slide['points']:
+            beamer += f'        \\item {point["content"]}\n'
+        beamer += '    \\end{itemize}\n'
+        beamer += '\\end{frame}\n'
+    beamer += '```\n'
+    return beamer
+
+
+async def generate(url):
+    pages = load_pdf_url(url)
+    docs = semantic_chunking(pages)
+    map_res = await map_chain.abatch([{'doc': doc.page_content} for doc in docs], config={'max_concurrency': len(docs)})
+    reduce_res = reduce_chain.invoke({'docs': map_res})
+    json_res = markdown_to_json(reduce_res.content)
+    html_res = json_to_html(json_res)
+    beamer_res = json_to_beamer(json_res)
+    return html_res, beamer_res
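
`markdown_to_json` is pure string parsing, so it can be exercised without calling any model (note that importing `generate` still pulls in `llm`, which expects `OPENAI_KEY` in the environment). A small sketch with illustrative slide markdown:

```python
from generate import markdown_to_json

sample = '### Slide 1: Introduction\n- motivates the problem\n- surveys prior work\n'
print(markdown_to_json(sample))
# [{'slide_number': 1, 'slide_title': 'Introduction',
#   'points': [{'content': 'motivates the problem', 'sources': []},
#              {'content': 'surveys prior work', 'sources': []}]}]
```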
llm.py ADDED
@@ -0,0 +1,9 @@
+import os
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+
+load_dotenv()
+
+gpt3 = ChatOpenAI(model='gpt-3.5-turbo-0125', openai_api_key=os.environ['OPENAI_KEY'])
+embedder = OpenAIEmbeddings(model='text-embedding-ada-002', openai_api_key=os.environ['OPENAI_KEY'])
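
Both clients read `OPENAI_KEY` from the environment, which `python-dotenv` populates from the `.env` file that `.gitignore` keeps out of the repository. A quick sanity check for the local setup (a sketch — the variable names match what `llm.py` and `main.py` read; the layout shown in the comments is an assumption):

```python
# Verify the environment is wired up before launching the app.
# A local .env file would look something like (placeholder values):
#   OPENAI_KEY=sk-...
#   PASSWORD=shared-initials
import os

from dotenv import load_dotenv

load_dotenv()
for var in ('OPENAI_KEY', 'PASSWORD'):
    assert os.environ.get(var), f'missing {var} in .env'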
main.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
main.py ADDED
@@ -0,0 +1,51 @@
+import os
+
+import gradio as gr
+from dotenv import load_dotenv
+
+from generate import generate
+
+load_dotenv()
+password = os.environ['PASSWORD']
+
+
+async def run(initials, pdf_url, pdf_file):
+    if initials != password:
+        return 'You do not have permission.', ''
+    url = pdf_url or pdf_file
+    res = await generate(url)
+    return res
+
+theme = gr.themes.Base()
+
+
+def main():
+    with gr.Blocks(theme=theme, title='PaperDeck') as demo:
+        gr.Markdown('# ')
+        with gr.Row():
+            with gr.Column(scale=5):
+                gr.Markdown('# PaperDeck2')
+        with gr.Row():
+            with gr.Column(scale=1, min_width=300):
+                gr.Markdown('# ')
+                gr.Markdown('## Upload')
+                with gr.Group():
+                    password_input = gr.Textbox(max_lines=1, placeholder='Enter your initials.', show_label=False)
+                    url_input = gr.Textbox(max_lines=1, placeholder='Enter the paper URL.', show_label=False)
+                    pdf_file = gr.File(show_label=False)
+                submit_button = gr.Button(value='Run', variant='primary')
+            with gr.Column(scale=3, min_width=600):
+                gr.Markdown('# ')
+                gr.Markdown('## Results')
+                with gr.Tab('Preview'):
+                    preview_html = gr.HTML()
+                with gr.Tab('Beamer'):
+                    beamer_markdown = gr.Markdown()
+        submit_button.click(fn=run, inputs=[password_input, url_input, pdf_file], outputs=[preview_html, beamer_markdown])
+
+    demo.launch()
+
+
+if __name__ == '__main__':
+    # launch the Gradio application
+    main()
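
`run` is an ordinary coroutine, so the password gate can be smoke-tested without launching the UI. A sketch (importing `main` requires the `.env` variables described above; the URL is illustrative):

```python
import asyncio

from main import run

# Wrong initials short-circuit before any PDF work happens.
html, beamer = asyncio.run(run('wrong-initials', 'https://example.com/paper.pdf', None))
print(html)  # -> 'You do not have permission.'
```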
requirements.txt ADDED
@@ -0,0 +1,127 @@
+aiofiles==23.2.1
+aiohttp==3.9.5
+aiosignal==1.3.1
+altair==5.3.0
+annotated-types==0.7.0
+anyio==4.4.0
+appnope==0.1.4
+asttokens==2.4.1
+async-timeout==4.0.3
+attrs==23.2.0
+certifi==2024.6.2
+charset-normalizer==3.3.2
+click==8.1.7
+comm==0.2.2
+contourpy==1.2.1
+cycler==0.12.1
+dataclasses-json==0.6.7
+debugpy==1.8.2
+decorator==5.1.1
+distro==1.9.0
+dnspython==2.6.1
+email_validator==2.2.0
+exceptiongroup==1.2.1
+executing==2.0.1
+faiss-cpu==1.8.0.post1
+fastapi==0.111.0
+fastapi-cli==0.0.4
+ffmpy==0.3.2
+filelock==3.15.4
+fonttools==4.53.0
+frozenlist==1.4.1
+fsspec==2024.6.1
+gradio==4.37.2
+gradio_client==1.0.2
+h11==0.14.0
+httpcore==1.0.5
+httptools==0.6.1
+httpx==0.27.0
+huggingface-hub==0.23.4
+idna==3.7
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+ipykernel==6.29.4
+ipython==8.18.1
+jedi==0.19.1
+Jinja2==3.1.4
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema==4.22.0
+jsonschema-specifications==2023.12.1
+jupyter_client==8.6.2
+jupyter_core==5.7.2
+kiwisolver==1.4.5
+langchain==0.2.6
+langchain-community==0.2.6
+langchain-core==0.2.10
+langchain-experimental==0.0.62
+langchain-openai==0.1.13
+langchain-text-splitters==0.2.2
+langsmith==0.1.82
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+marshmallow==3.21.3
+matplotlib==3.9.0
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+multidict==6.0.5
+mypy-extensions==1.0.0
+nest-asyncio==1.6.0
+numpy==1.26.4
+openai==1.35.7
+orjson==3.10.5
+packaging==24.1
+pandas==2.2.2
+parso==0.8.4
+pexpect==4.9.0
+pillow==10.3.0
+platformdirs==4.2.2
+prompt_toolkit==3.0.47
+psutil==6.0.0
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pydantic==2.7.4
+pydantic_core==2.18.4
+pydub==0.25.1
+Pygments==2.18.0
+pyparsing==3.1.2
+pypdf==4.2.0
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.9
+pytz==2024.1
+PyYAML==6.0.1
+pyzmq==26.0.3
+referencing==0.35.1
+regex==2024.5.15
+requests==2.32.3
+rich==13.7.1
+rpds-py==0.18.1
+ruff==0.5.0
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+SQLAlchemy==2.0.31
+stack-data==0.6.3
+starlette==0.37.2
+tenacity==8.4.2
+tiktoken==0.7.0
+tomlkit==0.12.0
+toolz==0.12.1
+tornado==6.4.1
+tqdm==4.66.4
+traitlets==5.14.3
+typer==0.12.3
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.1
+ujson==5.10.0
+urllib3==2.2.2
+uvicorn==0.30.1
+uvloop==0.19.0
+watchfiles==0.22.0
+wcwidth==0.2.13
+websockets==11.0.3
+yarl==1.9.4
+zipp==3.19.2
ruff.toml ADDED
@@ -0,0 +1,80 @@
+# Exclude a variety of commonly ignored directories.
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".git-rewrite",
+    ".hg",
+    ".ipynb_checkpoints",
+    ".mypy_cache",
+    ".nox",
+    ".pants.d",
+    ".pyenv",
+    ".pytest_cache",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    ".vscode",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "site-packages",
+    "venv",
+]
+
+# Maximum line length (intentionally far looser than Black's default of 88).
+line-length = 320
+indent-width = 4
+
+# Assume Python 3.8
+target-version = "py38"
+
+[lint]
+# Enable Pyflakes (`F`), pycodestyle errors and warnings (`E`, `W`), and the
+# flake8-builtins (`A`), Pylint (`PLC`, `PLE`, `PLW`), isort (`I`), flake8-async
+# (`ASYNC`), pytest-style (`PT`), comprehensions (`C4`), and bugbear (`B`) rules.
+select = ['E', 'F', 'W', 'A', 'PLC', 'PLE', 'PLW', 'I', 'ASYNC', 'PT', 'C4', 'B']
+ignore = ["B008", "B023", "B006"]
+
+# Allow fixes for all enabled rules (when `--fix` is provided).
+fixable = ["ALL"]
+unfixable = []
+
+# Allow unused variables when underscore-prefixed.
+dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
+
+[format]
+# Use single quotes for strings (diverging from Black's double-quote default).
+quote-style = "single"
+
+# Like Black, indent with spaces, rather than tabs.
+indent-style = "space"
+
+# Like Black, respect magic trailing commas.
+skip-magic-trailing-comma = false
+
+# Like Black, automatically detect the appropriate line ending.
+line-ending = "auto"
+
+# Enable auto-formatting of code examples in docstrings. Markdown,
+# reStructuredText code/literal blocks and doctests are all supported.
+#
+# This is currently disabled by default, but it is planned for this
+# to be opt-out in the future.
+docstring-code-format = false
+
+# Set the line length limit used when formatting code snippets in
+# docstrings.
+#
+# This only has an effect when the `docstring-code-format` setting is
+# enabled.
+docstring-code-line-length = "dynamic"
+
+[lint.isort]
+combine-as-imports = true
venv/pyvenv.cfg ADDED
@@ -0,0 +1,3 @@
+home = /Library/Developer/CommandLineTools/usr/bin
+include-system-site-packages = false
+version = 3.9.6