tykimseoul committed
Commit bca3a10
1 Parent(s): 2759b95

Upload folder using huggingface_hub

.github/workflows/sync_space.yml ADDED
@@ -0,0 +1,28 @@
+name: Run Python script
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: "3.9"
+
+      - name: Install Gradio
+        run: python -m pip install gradio
+
+      - name: Log in to Hugging Face
+        run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
+
+      - name: Deploy to Spaces
+        run: gradio deploy
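
The login step above shells out to a one-line Python snippet. Expanded for readability, it is equivalent to the following sketch — the real token is injected from the repository's `hf_token` secret, and the literal below is only a placeholder:

```python
# Equivalent of the workflow's "Log in to Hugging Face" step, expanded.
# The token value comes from the `hf_token` repository secret at CI time;
# this literal is a placeholder, not a working credential.
import huggingface_hub

huggingface_hub.login(token='hf_xxxxxxxxxxxxxxxx')
```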
.gitignore ADDED
@@ -0,0 +1,2 @@
+venv/share
+.env
README.md CHANGED
@@ -1,11 +1,11 @@
 ---
 title: PaperDeck
-emoji: 🔥
-colorFrom: purple
-colorTo: indigo
+app_file: main.py
+emoji: 😻
+colorFrom: pink
+colorTo: green
 sdk: gradio
 sdk_version: 4.37.2
-app_file: app.py
 pinned: false
 ---
 
chains/map_docs.py ADDED
@@ -0,0 +1,19 @@
+from langchain_core.prompts import PromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+
+from llm import gpt3
+
+map_template = """You are an expert in technical papers and journals.
+You are tasked with summarizing the main points of the following text.
+The following is the text you need to summarize:
+{doc}
+Based on this text, provide a summary of the main points.
+
+RULES:
+- Organize the points in markdown format.
+
+Helpful Answer:
+"""
+
+map_prompt = PromptTemplate.from_template(map_template)
+map_chain = {'doc': RunnablePassthrough()} | map_prompt | gpt3
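
Because `map_chain` wraps its raw input into a `{'doc': ...}` dict via `RunnablePassthrough`, it can be invoked directly with a text chunk. A minimal usage sketch (assumes `OPENAI_KEY` is set in the environment, as `llm.py` expects; the sample text is illustrative):

```python
from chains.map_docs import map_chain

# Invoke on a single chunk of paper text; the result is an AIMessage
# whose .content holds the markdown summary produced by gpt3.
chunk = 'Self-attention relates every token in a sequence to every other token...'
summary = map_chain.invoke(chunk)
print(summary.content)
```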
chains/reduce_docs.py ADDED
@@ -0,0 +1,36 @@
+from langchain_core.prompts import PromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+
+from llm import gpt3
+
+
+def concat_summaries(docs):
+    return '\n\n'.join(doc.content for doc in docs['docs'])
+
+
+# Reduce
+reduce_template = """The following is a set of summaries of a technical paper:
+{docs}
+
+Take these and distill them into a final, consolidated summary of the main points.
+
+RULES:
+- The summary should read as if you are presenting the paper in a seminar.
+- The outline should include the common sections of a technical seminar.
+- Organize the points in PowerPoint slide format.
+- Use markdown to format the text.
+- Each point may be technical.
+- You may have as many points as you need.
+
+Each slide should follow this format:
+### Slide 2: Slide title
+- point 1
+- point 2
+
+Helpful Answer:
+"""
+
+reduce_prompt = PromptTemplate.from_template(reduce_template)
+
+# Run chain
+reduce_chain = {'docs': RunnablePassthrough() | concat_summaries} | reduce_prompt | gpt3
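
`reduce_chain` expects a dict with a `docs` key holding objects that expose `.content` — exactly the shape of the `AIMessage` list that `map_chain.abatch` returns in `generate.py`. A minimal sketch with hand-built messages (again assumes `OPENAI_KEY` is set):

```python
from langchain_core.messages import AIMessage

from chains.reduce_docs import reduce_chain

# Stand-ins for the per-chunk summaries normally produced by map_chain.
summaries = [AIMessage(content='- introduces the model'), AIMessage(content='- reports benchmark results')]
final = reduce_chain.invoke({'docs': summaries})
print(final.content)  # consolidated, slide-formatted summary
```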
generate.py ADDED
@@ -0,0 +1,78 @@
+import re
+
+from langchain.globals import set_debug
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_experimental.text_splitter import SemanticChunker
+
+from chains.map_docs import map_chain
+from chains.reduce_docs import reduce_chain
+from llm import embedder
+
+set_debug(True)
+
+
+def load_pdf_url(url):
+    loader = PyPDFLoader(url)
+    pages = loader.load()
+    return pages
+
+
+def semantic_chunking(pages):
+    text_splitter = SemanticChunker(embedder, breakpoint_threshold_type='gradient')
+    docs = text_splitter.create_documents([' '.join([page.page_content for page in pages])])
+    return docs
+
+
+def markdown_to_json(markdown):
+    regex_pattern = r'### Slide (\d+): (.+)\n((?:- .+\n?)+)'  # slide header followed by one or more bullet lines; final newline optional
+    matches = re.findall(regex_pattern, markdown)
+    slides = []
+    for match in matches:
+        slide = {'slide_number': int(match[0]), 'slide_title': match[1], 'points': [{'content': point.lstrip('-').strip(), 'sources': []} for point in match[2].split('\n') if point.strip()]}
+        slides.append(slide)
+    return slides
+
+
+def json_to_html(slides):
+    template = """
+    <div style="display: flex; gap: 4px 0px; flex-direction: column;">
+        <div style="align-self: flex-start; font-size: 12px; font-weight: bold; padding: 2px 6px; background-color: #55555555; border-radius: 4px">Slide {slide_no}</div>
+        <div style="font-size: 16px; font-weight: bold; margin-left: 4px">{slide_title}</div>
+        <div style="margin-left: 8px">
+            {points}
+        </div>
+    </div>
+    """
+    html = '<div style="display: flex; gap: 24px 0px; flex-direction: column;">'
+    for slide in slides:
+        points = [f'<li>{point["content"]}</li>' for point in slide['points']]
+        points = '<ul>' + ''.join(points) + '</ul>'
+        slide_html = template.format(slide_no=slide['slide_number'], slide_title=slide['slide_title'], points=points)
+        html += slide_html
+    html += '</div>'
+    return html
+
+
+def json_to_beamer(slides):
+    beamer = ''
+    beamer += '```tex\n'
+    for slide in slides:
+        beamer += f'\\begin{{frame}}{{{slide["slide_title"]}}}\n'
+        beamer += '    \\begin{itemize}\n'
+        for point in slide['points']:
+            beamer += f'        \\item {point["content"]}\n'
+        beamer += '    \\end{itemize}\n'
+        beamer += '\\end{frame}\n'
+    beamer += '```\n'
+    return beamer
+
+
+async def generate(url):
+    pages = load_pdf_url(url)
+    docs = semantic_chunking(pages)
+    map_res = await map_chain.abatch([{'doc': doc.page_content} for doc in docs], config={'max_concurrency': len(docs)})
+    reduce_res = reduce_chain.invoke({'docs': map_res})
+    json_res = markdown_to_json(reduce_res.content)
+    html_res = json_to_html(json_res)
+    beamer_res = json_to_beamer(json_res)
+    return html_res, beamer_res
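
`markdown_to_json` is pure string parsing, so it can be exercised without calling any model (note that importing `generate` still pulls in `llm`, which expects `OPENAI_KEY` in the environment). A small sketch with illustrative slide markdown:

```python
from generate import markdown_to_json

sample = '### Slide 1: Introduction\n- motivates the problem\n- surveys prior work\n'
print(markdown_to_json(sample))
# [{'slide_number': 1, 'slide_title': 'Introduction',
#   'points': [{'content': 'motivates the problem', 'sources': []},
#              {'content': 'surveys prior work', 'sources': []}]}]
```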
llm.py ADDED
@@ -0,0 +1,9 @@
+import os
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+
+load_dotenv()
+
+gpt3 = ChatOpenAI(model='gpt-3.5-turbo-0125', openai_api_key=os.environ['OPENAI_KEY'])
+embedder = OpenAIEmbeddings(model='text-embedding-ada-002', openai_api_key=os.environ['OPENAI_KEY'])
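
Both clients read `OPENAI_KEY` from the environment, which `python-dotenv` populates from the `.env` file that `.gitignore` keeps out of the repository. A quick sanity check for the local setup (a sketch — the variable names match what `llm.py` and `main.py` read; the layout shown in the comments is an assumption):

```python
# Verify the environment is wired up before launching the app.
# A local .env file would look something like (placeholder values):
#   OPENAI_KEY=sk-...
#   PASSWORD=shared-initials
import os

from dotenv import load_dotenv

load_dotenv()
for var in ('OPENAI_KEY', 'PASSWORD'):
    assert os.environ.get(var), f'missing {var} in .env'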
main.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
main.py ADDED
@@ -0,0 +1,51 @@
+import os
+
+import gradio as gr
+from dotenv import load_dotenv
+
+from generate import generate
+
+load_dotenv()
+password = os.environ['PASSWORD']
+
+
+async def run(initials, pdf_url, pdf_file):
+    if initials != password:
+        return 'You do not have permission.', ''
+    url = pdf_url or pdf_file
+    res = await generate(url)
+    return res
+
+theme = gr.themes.Base()
+
+
+def main():
+    with gr.Blocks(theme=theme, title='PaperDeck') as demo:
+        gr.Markdown('# ')
+        with gr.Row():
+            with gr.Column(scale=5):
+                gr.Markdown('# PaperDeck2')
+        with gr.Row():
+            with gr.Column(scale=1, min_width=300):
+                gr.Markdown('# ')
+                gr.Markdown('## Upload')
+                with gr.Group():
+                    password_input = gr.Textbox(max_lines=1, placeholder='Enter your initials.', show_label=False)
+                    url_input = gr.Textbox(max_lines=1, placeholder='Enter the paper URL.', show_label=False)
+                    pdf_file = gr.File(show_label=False)
+                submit_button = gr.Button(value='Run', variant='primary')
+            with gr.Column(scale=3, min_width=600):
+                gr.Markdown('# ')
+                gr.Markdown('## Results')
+                with gr.Tab('Preview'):
+                    preview_html = gr.HTML()
+                with gr.Tab('Beamer'):
+                    beamer_markdown = gr.Markdown()
+        submit_button.click(fn=run, inputs=[password_input, url_input, pdf_file], outputs=[preview_html, beamer_markdown])
+
+    demo.launch()
+
+
+if __name__ == '__main__':
+    # launch the Gradio application
+    main()
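
`run` is an ordinary coroutine, so the password gate can be smoke-tested without launching the UI. A sketch (importing `main` requires the `.env` variables described above; the URL is illustrative):

```python
import asyncio

from main import run

# Wrong initials short-circuit before any PDF work happens.
html, beamer = asyncio.run(run('wrong-initials', 'https://example.com/paper.pdf', None))
print(html)  # -> 'You do not have permission.'
```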
requirements.txt ADDED
@@ -0,0 +1,127 @@
+aiofiles==23.2.1
+aiohttp==3.9.5
+aiosignal==1.3.1
+altair==5.3.0
+annotated-types==0.7.0
+anyio==4.4.0
+appnope==0.1.4
+asttokens==2.4.1
+async-timeout==4.0.3
+attrs==23.2.0
+certifi==2024.6.2
+charset-normalizer==3.3.2
+click==8.1.7
+comm==0.2.2
+contourpy==1.2.1
+cycler==0.12.1
+dataclasses-json==0.6.7
+debugpy==1.8.2
+decorator==5.1.1
+distro==1.9.0
+dnspython==2.6.1
+email_validator==2.2.0
+exceptiongroup==1.2.1
+executing==2.0.1
+faiss-cpu==1.8.0.post1
+fastapi==0.111.0
+fastapi-cli==0.0.4
+ffmpy==0.3.2
+filelock==3.15.4
+fonttools==4.53.0
+frozenlist==1.4.1
+fsspec==2024.6.1
+gradio==4.37.2
+gradio_client==1.0.2
+h11==0.14.0
+httpcore==1.0.5
+httptools==0.6.1
+httpx==0.27.0
+huggingface-hub==0.23.4
+idna==3.7
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+ipykernel==6.29.4
+ipython==8.18.1
+jedi==0.19.1
+Jinja2==3.1.4
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema==4.22.0
+jsonschema-specifications==2023.12.1
+jupyter_client==8.6.2
+jupyter_core==5.7.2
+kiwisolver==1.4.5
+langchain==0.2.6
+langchain-community==0.2.6
+langchain-core==0.2.10
+langchain-experimental==0.0.62
+langchain-openai==0.1.13
+langchain-text-splitters==0.2.2
+langsmith==0.1.82
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+marshmallow==3.21.3
+matplotlib==3.9.0
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+multidict==6.0.5
+mypy-extensions==1.0.0
+nest-asyncio==1.6.0
+numpy==1.26.4
+openai==1.35.7
+orjson==3.10.5
+packaging==24.1
+pandas==2.2.2
+parso==0.8.4
+pexpect==4.9.0
+pillow==10.3.0
+platformdirs==4.2.2
+prompt_toolkit==3.0.47
+psutil==6.0.0
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pydantic==2.7.4
+pydantic_core==2.18.4
+pydub==0.25.1
+Pygments==2.18.0
+pyparsing==3.1.2
+pypdf==4.2.0
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.9
+pytz==2024.1
+PyYAML==6.0.1
+pyzmq==26.0.3
+referencing==0.35.1
+regex==2024.5.15
+requests==2.32.3
+rich==13.7.1
+rpds-py==0.18.1
+ruff==0.5.0
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+SQLAlchemy==2.0.31
+stack-data==0.6.3
+starlette==0.37.2
+tenacity==8.4.2
+tiktoken==0.7.0
+tomlkit==0.12.0
+toolz==0.12.1
+tornado==6.4.1
+tqdm==4.66.4
+traitlets==5.14.3
+typer==0.12.3
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.1
+ujson==5.10.0
+urllib3==2.2.2
+uvicorn==0.30.1
+uvloop==0.19.0
+watchfiles==0.22.0
+wcwidth==0.2.13
+websockets==11.0.3
+yarl==1.9.4
+zipp==3.19.2
ruff.toml ADDED
@@ -0,0 +1,80 @@
+# Exclude a variety of commonly ignored directories.
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".git-rewrite",
+    ".hg",
+    ".ipynb_checkpoints",
+    ".mypy_cache",
+    ".nox",
+    ".pants.d",
+    ".pyenv",
+    ".pytest_cache",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    ".vscode",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "site-packages",
+    "venv",
+]
+
+# Maximum line length (intentionally far looser than Black's default of 88).
+line-length = 320
+indent-width = 4
+
+# Assume Python 3.8
+target-version = "py38"
+
+[lint]
+# Enable Pyflakes (`F`), pycodestyle errors and warnings (`E`, `W`), and the
+# flake8-builtins (`A`), Pylint (`PLC`, `PLE`, `PLW`), isort (`I`), flake8-async
+# (`ASYNC`), pytest-style (`PT`), comprehensions (`C4`), and bugbear (`B`) rules.
+select = ['E', 'F', 'W', 'A', 'PLC', 'PLE', 'PLW', 'I', 'ASYNC', 'PT', 'C4', 'B']
+ignore = ["B008", "B023", "B006"]
+
+# Allow fixes for all enabled rules (when `--fix` is provided).
+fixable = ["ALL"]
+unfixable = []
+
+# Allow unused variables when underscore-prefixed.
+dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
+
+[format]
+# Use single quotes for strings (diverging from Black's double-quote default).
+quote-style = "single"
+
+# Like Black, indent with spaces, rather than tabs.
+indent-style = "space"
+
+# Like Black, respect magic trailing commas.
+skip-magic-trailing-comma = false
+
+# Like Black, automatically detect the appropriate line ending.
+line-ending = "auto"
+
+# Enable auto-formatting of code examples in docstrings. Markdown,
+# reStructuredText code/literal blocks and doctests are all supported.
+#
+# This is currently disabled by default, but it is planned for this
+# to be opt-out in the future.
+docstring-code-format = false
+
+# Set the line length limit used when formatting code snippets in
+# docstrings.
+#
+# This only has an effect when the `docstring-code-format` setting is
+# enabled.
+docstring-code-line-length = "dynamic"
+
+[lint.isort]
+combine-as-imports = true
venv/pyvenv.cfg ADDED
@@ -0,0 +1,3 @@
+home = /Library/Developer/CommandLineTools/usr/bin
+include-system-site-packages = false
+version = 3.9.6