LOUIS SANNA committed on
Commit
ec0786f
1 Parent(s): aa5f7bd

feat(*): make the whole project run locally

Browse files
Files changed (33) hide show
  1. .gitattributes +3 -0
  2. .gitignore +1 -0
  3. README.md +31 -0
  4. app.py +5 -0
  5. chroma/chroma-embeddings.parquet +2 -2
  6. chroma/chroma-embeddings.parquet.tmp +0 -0
  7. chroma/index/{id_to_uuid_63b0b7b3-7308-4629-ba5b-af235fc19082.pkl → id_to_uuid_62dbdb42-07cb-451b-94f8-85ab4e0a3aa9.pkl} +2 -2
  8. chroma/index/{index_63b0b7b3-7308-4629-ba5b-af235fc19082.bin → index_62dbdb42-07cb-451b-94f8-85ab4e0a3aa9.bin} +2 -2
  9. chroma/index/{index_metadata_63b0b7b3-7308-4629-ba5b-af235fc19082.pkl → index_metadata_62dbdb42-07cb-451b-94f8-85ab4e0a3aa9.pkl} +1 -1
  10. chroma/index/{uuid_to_id_63b0b7b3-7308-4629-ba5b-af235fc19082.pkl → uuid_to_id_62dbdb42-07cb-451b-94f8-85ab4e0a3aa9.pkl} +2 -2
  11. data/raw/cixiidae/Fulgoroidea1906-FaunaBritishIndiaCelonBurma-Distant.pdf +3 -0
  12. data/raw/cixiidae/Fulgoroidea1922-NewIndianHomoptera-Muir.pdf +3 -0
  13. data/raw/cixiidae/Fulgoroidea1934-NewLittleKnownFulgoroidea.Muir1934.pdf [missing#581, 587?].pdf +3 -0
  14. data/raw/cixiidae/Fulgoroidea1942-HomoptèresChopardAfriqueOccidentale-Lallemand.pdf +3 -0
  15. data/raw/cixiidae/Fulgoroidea1945-LanterfliesTrinidadSiuthAmerica-Fennah.pdf +3 -0
  16. data/raw/cixiidae/Fulgoroidea1945-PintaliaEquitosaAteson-Fennah.pdf +3 -0
  17. data/raw/cixiidae/Fulgoroidea1952-FauneDeFrance-Ribaut .pdf +3 -0
  18. data/raw/cixiidae/Fulgoroidea1954-CarolineIslands-Metcalf.pdf +3 -0
  19. data/raw/cixiidae/Fulgoroidea1957-DieZikadenAfghanist-Dlabola.pdf +3 -0
  20. data/raw/cixiidae/Fulgoroidea1958-MontNimba-Lallemand.pdf +3 -0
  21. data/raw/cixiidae/Fulgoroidea1965-NewSpeciesWestIndies-Fennah.pdf +3 -0
  22. data/raw/cixiidae/Fulgoroidea1967-Galapagos-Fennah.PDF +3 -0
  23. data/raw/cixiidae/Fulgoroidea1967-New LittleKnownSouthAfrica-Fennah.pdf +3 -0
  24. data/raw/cixiidae/Fulgoroidea1969-NewCaledonia-Fennah.pdf +3 -0
  25. data/raw/cixiidae/Fulgoroidea1982-ScientificResultsMountCameroonExpedition-VanStalle.pdf +3 -0
  26. data/raw/cixiidae/Fulgoroidea1985-EconomicInsectFaunaChina-ChouLuHuangWang.pdf +3 -0
  27. data/raw/cixiidae/Fulgoroidea1985-NewSynonymiesCombinationsNewWorldFulgoroidea-Obrien 1985 3491.pdf +3 -0
  28. data/raw/cixiidae/Fulgoroidea2008-FulgoromorphaSeychellesPreliminaryChecklis-Holzinger-LöckerLöcker.pdf +3 -0
  29. data/raw/cixiidae/Fulgoroidea2008-Mascareignes-AttiéBourgoinVeslotSoulier.pdf +3 -0
  30. example.env +1 -0
  31. load.py +41 -0
  32. poetry.lock +0 -0
  33. pyproject.toml +22 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  chroma filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  chroma filter=lfs diff=lfs merge=lfs -text
36
+ data filter=lfs diff=lfs merge=lfs -text
37
+ data/raw/cixiidae/*.pdf filter=lfs diff=lfs merge=lfs -text
38
+ data/raw/cixiidae/Fulgoroidea1967-Galapagos-Fennah.PDF filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
README.md CHANGED
@@ -10,3 +10,34 @@ pinned: false
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
13
+
14
+
15
+ ### Description
16
+
17
+ This project uses an LLM to interact with documents describing Fulgoroidea.
18
+
19
+ ### Quick start
20
+
21
+ This project uses Poetry.
22
+
23
+ To install the packages:
24
+
25
+ ```shell
26
+ poetry install
27
+ ```
28
+
29
+ To launch the custom environment:
30
+ ```shell
31
+ poetry shell
32
+ ```
33
+
34
+ Then, to launch the app:
35
+
36
+ ```shell
37
+ python app.py
38
+ ```
39
+
40
+ App can be found at http://127.0.0.1:7860/
41
+
42
+
43
+
app.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
2
  from langchain.vectorstores import Chroma # for the vectorization part
3
  from langchain.chains import ConversationalRetrievalChain
 
1
+ from dotenv import load_dotenv
2
+
3
+ # Load environment variables from .env file
4
+ load_dotenv()
5
+
6
  from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
7
  from langchain.vectorstores import Chroma # for the vectorization part
8
  from langchain.chains import ConversationalRetrievalChain
chroma/chroma-embeddings.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ccbf54215927a71ebb20809761fae745307f060abc93e49b48064750b5fcba1b
3
- size 45295396
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86ce62daaa8745aef49791108519965e063de1a1caa0611fefba9915330a8d38
3
+ size 88840691
chroma/chroma-embeddings.parquet.tmp ADDED
File without changes
chroma/index/{id_to_uuid_63b0b7b3-7308-4629-ba5b-af235fc19082.pkl → id_to_uuid_62dbdb42-07cb-451b-94f8-85ab4e0a3aa9.pkl} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eba841025944f648bbeadabfcc703e1d3ba869e94bb188cf036f1476f41a3ff9
3
- size 115799
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38d1b4a85efa0adb626b6bef79faec65538f47da093d42fa4af15e6ad7bf8662
3
+ size 116066
chroma/index/{index_63b0b7b3-7308-4629-ba5b-af235fc19082.bin → index_62dbdb42-07cb-451b-94f8-85ab4e0a3aa9.bin} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8aec36f0d959e3241c04693e550f7f26514c27894c58d9186e2b38d870f2079
3
- size 22465256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b312b85e2e2ba964451568d56a9235dd5f990451d8272ea5f2f72bda972c2d8
3
+ size 22514608
chroma/index/{index_metadata_63b0b7b3-7308-4629-ba5b-af235fc19082.pkl → index_metadata_62dbdb42-07cb-451b-94f8-85ab4e0a3aa9.pkl} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:200b2c59698f3cd951e3fbb0c0be1adb214d5b471c5e9304e761f6ec8a4b546e
3
  size 74
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08928359a065acd02acc0dd47b309a4a5cf7716dfad604edb06a3a52de955544
3
  size 74
chroma/index/{uuid_to_id_63b0b7b3-7308-4629-ba5b-af235fc19082.pkl → uuid_to_id_62dbdb42-07cb-451b-94f8-85ab4e0a3aa9.pkl} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2261ce355c76b03fb84877e970454e3f80252fcb19f1491b9f3ddf1500e984eb
3
- size 135445
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92b9ed8b3ea549ce344b1171b5f50c3f636d33c7f58f6da5df0848c44e0f8cbb
3
+ size 135749
data/raw/cixiidae/Fulgoroidea1906-FaunaBritishIndiaCelonBurma-Distant.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47374edb165ec953eceab55609918f6e4585c20960c42ad3f68374901c4930a3
3
+ size 38324614
data/raw/cixiidae/Fulgoroidea1922-NewIndianHomoptera-Muir.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba42c7d09d4c5454270c50dc758edb295fccef72dbc8511670321256562c77a3
3
+ size 3574936
data/raw/cixiidae/Fulgoroidea1934-NewLittleKnownFulgoroidea.Muir1934.pdf [missing#581, 587?].pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55dd9ceeefdcb184212b50a74760db0f067ebd24084f3221a846d24a9b67d232
3
+ size 436216
data/raw/cixiidae/Fulgoroidea1942-HomoptèresChopardAfriqueOccidentale-Lallemand.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fdb1a26498d804939d71b7ab325b57be3f3b560d246dad3a8b2b816c4a9e71f
3
+ size 188673
data/raw/cixiidae/Fulgoroidea1945-LanterfliesTrinidadSiuthAmerica-Fennah.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:000a43fb2e9d1560664f25b4a99038e2b0cb22b81e3be728198e981b13591a47
3
+ size 62543484
data/raw/cixiidae/Fulgoroidea1945-PintaliaEquitosaAteson-Fennah.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50efdb4949fde4efeefc915512af757b2a59ba91b261c5fd6fbd105fc62edc00
3
+ size 818526
data/raw/cixiidae/Fulgoroidea1952-FauneDeFrance-Ribaut .pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:247c3656080830a0b64765b3bd793ca0d8bd7d892970dd36fa2a5ae8cdab0976
3
+ size 43546668
data/raw/cixiidae/Fulgoroidea1954-CarolineIslands-Metcalf.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d59893291595ff0c2a3f33706e234d94c3fab35ddc0243a043e75a887ba0005a
3
+ size 3327485
data/raw/cixiidae/Fulgoroidea1957-DieZikadenAfghanist-Dlabola.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94ad30ec6e163bf204c0ebe6093453509c6a9a552a12d9e3d7492f31af88607e
3
+ size 11136356
data/raw/cixiidae/Fulgoroidea1958-MontNimba-Lallemand.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3b8395eb2aff419b9e660e1bc3026c0e9f48e7e3e9699d3d6f928834c242fd9
3
+ size 621407
data/raw/cixiidae/Fulgoroidea1965-NewSpeciesWestIndies-Fennah.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:654a6efbbe8c6b743682a479cdf1b0a0d674d1b5a8e747d4a7e8eef192a912c4
3
+ size 1557342
data/raw/cixiidae/Fulgoroidea1967-Galapagos-Fennah.PDF ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e6d6509361be6c9926f0f94d58e609fe882b8f290c8666df8e6466fcb50cac0
3
+ size 1562170
data/raw/cixiidae/Fulgoroidea1967-New LittleKnownSouthAfrica-Fennah.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cb0394f0d7215d88fd2bc8eb4c52ec2a5d9cba3cb314a29d9bc8d68167c96c0
3
+ size 2656903
data/raw/cixiidae/Fulgoroidea1969-NewCaledonia-Fennah.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dae6d0db9b5d3d1fa63a2d5d8b7e5c3e5a9723a6da8e6da479930d1b6a53c1d3
3
+ size 12344035
data/raw/cixiidae/Fulgoroidea1982-ScientificResultsMountCameroonExpedition-VanStalle.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c09eecd6eeb8b6e746ce81d358faec03f6f2d7acd64d4d6b828834ec6932afb4
3
+ size 5030611
data/raw/cixiidae/Fulgoroidea1985-EconomicInsectFaunaChina-ChouLuHuangWang.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:062434ec1f6eba2f1822a061aa25c731e25d80b66f8f88eb84d09159df926a35
3
+ size 50565258
data/raw/cixiidae/Fulgoroidea1985-NewSynonymiesCombinationsNewWorldFulgoroidea-Obrien 1985 3491.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38fa7c85f06fe661e56979493b1df3dd0c142cbfcc4dc9a2f5d797bab1b24389
3
+ size 763298
data/raw/cixiidae/Fulgoroidea2008-FulgoromorphaSeychellesPreliminaryChecklis-Holzinger-LöckerLöcker.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61dd802becbb724794a73e63b6fb38ba6bec931b199dfd5de67cc0a0fc6c43cc
3
+ size 155971
data/raw/cixiidae/Fulgoroidea2008-Mascareignes-AttiéBourgoinVeslotSoulier.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b45f8118894317bfc092afdf4ecc91cbddcb6e00710725a875d5c293dd4957c5
3
+ size 465630
example.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY=
load.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+
3
+ # Load environment variables from .env file
4
+ load_dotenv()
5
+
6
+ from langchain.document_loaders import UnstructuredFileLoader # for loading the pdf
7
+ from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
8
+ from langchain.vectorstores import Chroma # for the vectorization part
9
+ from langchain.chains import ConversationalRetrievalChain
10
+ from langchain.llms import OpenAI # the LLM model we'll use (ChatGPT)
11
+ from langchain.text_splitter import CharacterTextSplitter
12
+ from glob import glob
13
+ import os
14
+
15
+ DOCUMENT_PATH = "data/raw/cixiidae"
16
+ DB_DIR = "chroma"
17
+
18
+ pdf_files = glob(os.path.join(DOCUMENT_PATH, "*.pdf"))
19
+ documents = []
20
+
21
+ # Iterate through the list of PDF files
22
+ for file_path in pdf_files:
23
+ try:
24
+ loader = UnstructuredFileLoader(file_path)
25
+ document = loader.load()
26
+ documents.extend(document)
27
+ print(f"File added: {file_path}")
28
+
29
+ except Exception as e:
30
+ print(f"An error occurred while processing the file {file_path}: {str(e)}")
31
+
32
+ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
33
+ documents = text_splitter.split_documents(documents)
34
+
35
+ # Now, documents contains all the chunks produced by splitting every loaded document
36
+ print(f'Total pages: {len(documents)}')
37
+
38
+ embeddings = OpenAIEmbeddings()
39
+ vectordb = Chroma.from_documents(documents, embedding=embeddings,
40
+ persist_directory=DB_DIR)
41
+ vectordb.persist()
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "datak"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["LOUIS SANNA <louissanna@gmail.com>"]
6
+ readme = "README.md"
7
+
8
+ [tool.poetry.dependencies]
9
+ python = "^3.11"
10
+ openai = "^0.27.6"
11
+ langchain = "^0.0.161"
12
+ chromadb = "^0.3.21"
13
+ gradio = "^3.28.3"
14
+ python-dotenv = "^1.0.0"
15
+ unstructured = "^0.6.3"
16
+ tiktoken = "^0.4.0"
17
+ pytesseract = "^0.3.10"
18
+
19
+
20
+ [build-system]
21
+ requires = ["poetry-core"]
22
+ build-backend = "poetry.core.masonry.api"