LOUIS SANNA committed on
Commit
bddb702
1 Parent(s): dedb34c

clean(*): add black formatter

Browse files
Files changed (4) hide show
  1. app.py +10 -10
  2. load.py +8 -9
  3. poetry.lock +77 -1
  4. pyproject.toml +3 -0
app.py CHANGED
@@ -3,21 +3,22 @@ from dotenv import load_dotenv
3
  # Load environment variables from .env file
4
  load_dotenv()
5
 
6
- from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
7
- from langchain.vectorstores import Chroma # for the vectorization part
8
  from langchain.chains import ConversationalRetrievalChain
9
- from langchain.llms import OpenAI # the LLM model we'll use (ChatGPT)
10
  import gradio as gr
11
- from gradio import inputs, outputs
12
- from gradio.mix import Parallel
13
 
14
  max_sources = 4
15
  DB_DIR = "chroma"
16
 
17
  embedding = OpenAIEmbeddings()
18
  vectordb = Chroma(persist_directory=DB_DIR, embedding_function=embedding)
19
- pdf_qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0.9, model_name="gpt-3.5-turbo"),
20
- vectordb.as_retriever(), return_source_documents=True)
 
 
 
21
 
22
 
23
  def chat_pdf(query, chat_history=""):
@@ -36,7 +37,6 @@ def chat_pdf(query, chat_history=""):
36
  # Pad the outputs to match the number of output components in the Gradio interface
37
  padded_outputs = [answer] + cleaned_docs + [""] * (max_sources - len(cleaned_docs))
38
  return padded_outputs
39
- return [answer] + cleaned_docs
40
 
41
 
42
  def create_outputs(num_sources):
@@ -55,8 +55,8 @@ iface = gr.Interface(
55
  examples=[
56
  ["Give 2 species of fulgoroidea"],
57
  ["What colors are found among fulgoroidea?"],
58
- ["Why are fulgoroidea so cute?"]
59
  ],
60
  )
61
 
62
- iface.launch(debug=True)
 
3
  # Load environment variables from .env file
4
  load_dotenv()
5
 
6
+ from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
7
+ from langchain.vectorstores import Chroma # for the vectorization part
8
  from langchain.chains import ConversationalRetrievalChain
9
+ from langchain.llms import OpenAI # the LLM model we'll use (ChatGPT)
10
  import gradio as gr
 
 
11
 
12
  max_sources = 4
13
  DB_DIR = "chroma"
14
 
15
  embedding = OpenAIEmbeddings()
16
  vectordb = Chroma(persist_directory=DB_DIR, embedding_function=embedding)
17
+ pdf_qa = ConversationalRetrievalChain.from_llm(
18
+ OpenAI(temperature=0.9, model_name="gpt-3.5-turbo"),
19
+ vectordb.as_retriever(),
20
+ return_source_documents=True,
21
+ )
22
 
23
 
24
  def chat_pdf(query, chat_history=""):
 
37
  # Pad the outputs to match the number of output components in the Gradio interface
38
  padded_outputs = [answer] + cleaned_docs + [""] * (max_sources - len(cleaned_docs))
39
  return padded_outputs
 
40
 
41
 
42
  def create_outputs(num_sources):
 
55
  examples=[
56
  ["Give 2 species of fulgoroidea"],
57
  ["What colors are found among fulgoroidea?"],
58
+ ["Why are fulgoroidea so cute?"],
59
  ],
60
  )
61
 
62
+ iface.launch(debug=True)
load.py CHANGED
@@ -3,11 +3,9 @@ from dotenv import load_dotenv
3
  # Load environment variables from .env file
4
  load_dotenv()
5
 
6
- from langchain.document_loaders import UnstructuredFileLoader # for loading the pdf
7
- from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
8
- from langchain.vectorstores import Chroma # for the vectorization part
9
- from langchain.chains import ConversationalRetrievalChain
10
- from langchain.llms import OpenAI # the LLM model we'll use (ChatGPT)
11
  from langchain.text_splitter import CharacterTextSplitter
12
  from glob import glob
13
  import os
@@ -33,9 +31,10 @@ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
33
  documents = text_splitter.split_documents(documents)
34
 
35
  # Now, all_pages contains all the pages from every document
36
- print(f'Total pages: {len(documents)}')
37
 
38
  embeddings = OpenAIEmbeddings()
39
- vectordb = Chroma.from_documents(documents, embedding=embeddings,
40
- persist_directory=DB_DIR)
41
- vectordb.persist()
 
 
3
  # Load environment variables from .env file
4
  load_dotenv()
5
 
6
+ from langchain.document_loaders import UnstructuredFileLoader # for loading the pdf
7
+ from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
8
+ from langchain.vectorstores import Chroma # for the vectorization part
 
 
9
  from langchain.text_splitter import CharacterTextSplitter
10
  from glob import glob
11
  import os
 
31
  documents = text_splitter.split_documents(documents)
32
 
33
  # Now, all_pages contains all the pages from every document
34
+ print(f"Total pages: {len(documents)}")
35
 
36
  embeddings = OpenAIEmbeddings()
37
+ vectordb = Chroma.from_documents(
38
+ documents, embedding=embeddings, persist_directory=DB_DIR
39
+ )
40
+ vectordb.persist()
poetry.lock CHANGED
@@ -235,6 +235,54 @@ files = [
235
  {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"},
236
  ]
237
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  [[package]]
239
  name = "certifi"
240
  version = "2023.5.7"
@@ -2518,6 +2566,18 @@ sql-other = ["SQLAlchemy (>=1.4.16)"]
2518
  test = ["hypothesis (>=6.34.2)", "pytest (>=7.0.0)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"]
2519
  xml = ["lxml (>=4.6.3)"]
2520
 
 
 
 
 
 
 
 
 
 
 
 
 
2521
  [[package]]
2522
  name = "pdfminer-six"
2523
  version = "20221105"
@@ -2619,6 +2679,22 @@ files = [
2619
  docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"]
2620
  tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"]
2621
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2622
  [[package]]
2623
  name = "posthog"
2624
  version = "3.0.1"
@@ -4251,4 +4327,4 @@ cffi = ["cffi (>=1.11)"]
4251
  [metadata]
4252
  lock-version = "2.0"
4253
  python-versions = "^3.11"
4254
- content-hash = "4e93a19f41c6e64228f70330223c34062b2654aece2bb3f3092dcf095b60ea44"
 
235
  {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"},
236
  ]
237
 
238
+ [[package]]
239
+ name = "black"
240
+ version = "23.3.0"
241
+ description = "The uncompromising code formatter."
242
+ category = "dev"
243
+ optional = false
244
+ python-versions = ">=3.7"
245
+ files = [
246
+ {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"},
247
+ {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"},
248
+ {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"},
249
+ {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"},
250
+ {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"},
251
+ {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"},
252
+ {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"},
253
+ {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"},
254
+ {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"},
255
+ {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"},
256
+ {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"},
257
+ {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"},
258
+ {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"},
259
+ {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"},
260
+ {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"},
261
+ {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"},
262
+ {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"},
263
+ {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"},
264
+ {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"},
265
+ {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"},
266
+ {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"},
267
+ {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"},
268
+ {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"},
269
+ {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"},
270
+ {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"},
271
+ ]
272
+
273
+ [package.dependencies]
274
+ click = ">=8.0.0"
275
+ mypy-extensions = ">=0.4.3"
276
+ packaging = ">=22.0"
277
+ pathspec = ">=0.9.0"
278
+ platformdirs = ">=2"
279
+
280
+ [package.extras]
281
+ colorama = ["colorama (>=0.4.3)"]
282
+ d = ["aiohttp (>=3.7.4)"]
283
+ jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
284
+ uvloop = ["uvloop (>=0.15.2)"]
285
+
286
  [[package]]
287
  name = "certifi"
288
  version = "2023.5.7"
 
2566
  test = ["hypothesis (>=6.34.2)", "pytest (>=7.0.0)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"]
2567
  xml = ["lxml (>=4.6.3)"]
2568
 
2569
+ [[package]]
2570
+ name = "pathspec"
2571
+ version = "0.11.1"
2572
+ description = "Utility library for gitignore style pattern matching of file paths."
2573
+ category = "dev"
2574
+ optional = false
2575
+ python-versions = ">=3.7"
2576
+ files = [
2577
+ {file = "pathspec-0.11.1-py3-none-any.whl", hash = "sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293"},
2578
+ {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"},
2579
+ ]
2580
+
2581
  [[package]]
2582
  name = "pdfminer-six"
2583
  version = "20221105"
 
2679
  docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"]
2680
  tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"]
2681
 
2682
+ [[package]]
2683
+ name = "platformdirs"
2684
+ version = "3.5.1"
2685
+ description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
2686
+ category = "dev"
2687
+ optional = false
2688
+ python-versions = ">=3.7"
2689
+ files = [
2690
+ {file = "platformdirs-3.5.1-py3-none-any.whl", hash = "sha256:e2378146f1964972c03c085bb5662ae80b2b8c06226c54b2ff4aa9483e8a13a5"},
2691
+ {file = "platformdirs-3.5.1.tar.gz", hash = "sha256:412dae91f52a6f84830f39a8078cecd0e866cb72294a5c66808e74d5e88d251f"},
2692
+ ]
2693
+
2694
+ [package.extras]
2695
+ docs = ["furo (>=2023.3.27)", "proselint (>=0.13)", "sphinx (>=6.2.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"]
2696
+ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"]
2697
+
2698
  [[package]]
2699
  name = "posthog"
2700
  version = "3.0.1"
 
4327
  [metadata]
4328
  lock-version = "2.0"
4329
  python-versions = "^3.11"
4330
+ content-hash = "5efc01b2243b9e30421de14ff20d9331c548377e049d09cfa59179364f996019"
pyproject.toml CHANGED
@@ -17,6 +17,9 @@ tiktoken = "^0.4.0"
17
  pytesseract = "^0.3.10"
18
 
19
 
 
 
 
20
  [build-system]
21
  requires = ["poetry-core"]
22
  build-backend = "poetry.core.masonry.api"
 
17
  pytesseract = "^0.3.10"
18
 
19
 
20
+ [tool.poetry.group.dev.dependencies]
21
+ black = "^23.3.0"
22
+
23
  [build-system]
24
  requires = ["poetry-core"]
25
  build-backend = "poetry.core.masonry.api"