rianders committed on
Commit
5c8cbfc
1 Parent(s): c4a4127

fixed dependency generation

Browse files
app.py ADDED
File without changes
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "mpi-data-store"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["ricklon <rianders@docs.rutgers.edu>"]
6
+ readme = "README.md"
7
+ packages = [{include = "mpi_data_store"}]
8
+
9
+ [tool.poetry.dependencies]
10
+ python = "^3.10"
11
+ langchain = "^0.1.13"
12
+ pypdf = "^4.1.0"
13
+ langchain-community = "^0.0.29"
14
+ sentence-transformers = "^2.6.0"
15
+ faiss-cpu = "^1.8.0"
16
+ pandas = "^2.2.1"
17
+ tqdm = "^4.66.2"
18
+ streamlit = "^1.32.2"
19
+
20
+
21
+ [build-system]
22
+ requires = ["poetry-core"]
23
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.9.3 ; python_version >= "3.10" and python_version < "4.0"
2
+ aiosignal==1.3.1 ; python_version >= "3.10" and python_version < "4.0"
3
+ altair==5.2.0 ; python_version >= "3.10" and python_version < "4.0"
4
+ annotated-types==0.6.0 ; python_version >= "3.10" and python_version < "4.0"
5
+ anyio==4.3.0 ; python_version >= "3.10" and python_version < "4.0"
6
+ async-timeout==4.0.3 ; python_version >= "3.10" and python_version < "3.11"
7
+ attrs==23.2.0 ; python_version >= "3.10" and python_version < "4.0"
8
+ blinker==1.7.0 ; python_version >= "3.10" and python_version < "4.0"
9
+ cachetools==5.3.3 ; python_version >= "3.10" and python_version < "4.0"
10
+ certifi==2024.2.2 ; python_version >= "3.10" and python_version < "4.0"
11
+ charset-normalizer==3.3.2 ; python_version >= "3.10" and python_version < "4.0"
12
+ click==8.1.7 ; python_version >= "3.10" and python_version < "4.0"
13
+ colorama==0.4.6 ; python_version >= "3.10" and python_version < "4.0" and platform_system == "Windows"
14
+ dataclasses-json==0.6.4 ; python_version >= "3.10" and python_version < "4.0"
15
+ exceptiongroup==1.2.0 ; python_version >= "3.10" and python_version < "3.11"
16
+ faiss-cpu==1.8.0 ; python_version >= "3.10" and python_version < "4.0"
17
+ filelock==3.13.1 ; python_version >= "3.10" and python_version < "4.0"
18
+ frozenlist==1.4.1 ; python_version >= "3.10" and python_version < "4.0"
19
+ fsspec==2024.3.1 ; python_version >= "3.10" and python_version < "4.0"
20
+ gitdb==4.0.11 ; python_version >= "3.10" and python_version < "4.0"
21
+ gitpython==3.1.42 ; python_version >= "3.10" and python_version < "4.0"
22
+ greenlet==3.0.3 ; python_version >= "3.10" and python_version < "4.0" and platform_machine == "aarch64" or python_version >= "3.10" and python_version < "4.0" and platform_machine == "ppc64le" or python_version >= "3.10" and python_version < "4.0" and platform_machine == "x86_64" or python_version >= "3.10" and python_version < "4.0" and platform_machine == "amd64" or python_version >= "3.10" and python_version < "4.0" and platform_machine == "AMD64" or python_version >= "3.10" and python_version < "4.0" and platform_machine == "win32" or python_version >= "3.10" and python_version < "4.0" and platform_machine == "WIN32"
23
+ huggingface-hub==0.21.4 ; python_version >= "3.10" and python_version < "4.0"
24
+ idna==3.6 ; python_version >= "3.10" and python_version < "4.0"
25
+ jinja2==3.1.3 ; python_version >= "3.10" and python_version < "4.0"
26
+ joblib==1.3.2 ; python_version >= "3.10" and python_version < "4.0"
27
+ jsonpatch==1.33 ; python_version >= "3.10" and python_version < "4.0"
28
+ jsonpointer==2.4 ; python_version >= "3.10" and python_version < "4.0"
29
+ jsonschema-specifications==2023.12.1 ; python_version >= "3.10" and python_version < "4.0"
30
+ jsonschema==4.21.1 ; python_version >= "3.10" and python_version < "4.0"
31
+ langchain-community==0.0.29 ; python_version >= "3.10" and python_version < "4.0"
32
+ langchain-core==0.1.33 ; python_version >= "3.10" and python_version < "4.0"
33
+ langchain-text-splitters==0.0.1 ; python_version >= "3.10" and python_version < "4.0"
34
+ langchain==0.1.13 ; python_version >= "3.10" and python_version < "4.0"
35
+ langsmith==0.1.31 ; python_version >= "3.10" and python_version < "4.0"
36
+ markdown-it-py==3.0.0 ; python_version >= "3.10" and python_version < "4.0"
37
+ markupsafe==2.1.5 ; python_version >= "3.10" and python_version < "4.0"
38
+ marshmallow==3.21.1 ; python_version >= "3.10" and python_version < "4.0"
39
+ mdurl==0.1.2 ; python_version >= "3.10" and python_version < "4.0"
40
+ mpmath==1.3.0 ; python_version >= "3.10" and python_version < "4.0"
41
+ multidict==6.0.5 ; python_version >= "3.10" and python_version < "4.0"
42
+ mypy-extensions==1.0.0 ; python_version >= "3.10" and python_version < "4.0"
43
+ networkx==3.2.1 ; python_version >= "3.10" and python_version < "4.0"
44
+ numpy==1.26.4 ; python_version < "4.0" and python_version >= "3.10"
45
+ nvidia-cublas-cu12==12.1.3.1 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
46
+ nvidia-cuda-cupti-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
47
+ nvidia-cuda-nvrtc-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
48
+ nvidia-cuda-runtime-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
49
+ nvidia-cudnn-cu12==8.9.2.26 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
50
+ nvidia-cufft-cu12==11.0.2.54 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
51
+ nvidia-curand-cu12==10.3.2.106 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
52
+ nvidia-cusolver-cu12==11.4.5.107 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
53
+ nvidia-cusparse-cu12==12.1.0.106 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
54
+ nvidia-nccl-cu12==2.19.3 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
55
+ nvidia-nvjitlink-cu12==12.4.99 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
56
+ nvidia-nvtx-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.10" and python_version < "4.0"
57
+ orjson==3.9.15 ; python_version >= "3.10" and python_version < "4.0"
58
+ packaging==23.2 ; python_version >= "3.10" and python_version < "4.0"
59
+ pandas==2.2.1 ; python_version >= "3.10" and python_version < "4.0"
60
+ pillow==10.2.0 ; python_version >= "3.10" and python_version < "4.0"
61
+ protobuf==4.25.3 ; python_version >= "3.10" and python_version < "4.0"
62
+ pyarrow==15.0.2 ; python_version >= "3.10" and python_version < "4.0"
63
+ pydantic-core==2.16.3 ; python_version >= "3.10" and python_version < "4.0"
64
+ pydantic==2.6.4 ; python_version >= "3.10" and python_version < "4.0"
65
+ pydeck==0.8.1b0 ; python_version >= "3.10" and python_version < "4.0"
66
+ pygments==2.17.2 ; python_version >= "3.10" and python_version < "4.0"
67
+ pypdf==4.1.0 ; python_version >= "3.10" and python_version < "4.0"
68
+ python-dateutil==2.9.0.post0 ; python_version >= "3.10" and python_version < "4.0"
69
+ pytz==2024.1 ; python_version >= "3.10" and python_version < "4.0"
70
+ pyyaml==6.0.1 ; python_version >= "3.10" and python_version < "4.0"
71
+ referencing==0.34.0 ; python_version >= "3.10" and python_version < "4.0"
72
+ regex==2023.12.25 ; python_version >= "3.10" and python_version < "4.0"
73
+ requests==2.31.0 ; python_version >= "3.10" and python_version < "4.0"
74
+ rich==13.7.1 ; python_version >= "3.10" and python_version < "4.0"
75
+ rpds-py==0.18.0 ; python_version >= "3.10" and python_version < "4.0"
76
+ safetensors==0.4.2 ; python_version >= "3.10" and python_version < "4.0"
77
+ scikit-learn==1.4.1.post1 ; python_version >= "3.10" and python_version < "4.0"
78
+ scipy==1.12.0 ; python_version >= "3.10" and python_version < "4.0"
79
+ sentence-transformers==2.6.0 ; python_version >= "3.10" and python_version < "4.0"
80
+ six==1.16.0 ; python_version >= "3.10" and python_version < "4.0"
81
+ smmap==5.0.1 ; python_version >= "3.10" and python_version < "4.0"
82
+ sniffio==1.3.1 ; python_version >= "3.10" and python_version < "4.0"
83
+ sqlalchemy==2.0.29 ; python_version >= "3.10" and python_version < "4.0"
84
+ streamlit==1.32.2 ; python_version >= "3.10" and python_version < "4.0"
85
+ sympy==1.12 ; python_version >= "3.10" and python_version < "4.0"
86
+ tenacity==8.2.3 ; python_version >= "3.10" and python_version < "4.0"
87
+ threadpoolctl==3.4.0 ; python_version >= "3.10" and python_version < "4.0"
88
+ tokenizers==0.15.2 ; python_version >= "3.10" and python_version < "4.0"
89
+ toml==0.10.2 ; python_version >= "3.10" and python_version < "4.0"
90
+ toolz==0.12.1 ; python_version >= "3.10" and python_version < "4.0"
91
+ torch==2.2.1 ; python_version >= "3.10" and python_version < "4.0"
92
+ tornado==6.4 ; python_version >= "3.10" and python_version < "4.0"
93
+ tqdm==4.66.2 ; python_version >= "3.10" and python_version < "4.0"
94
+ transformers==4.39.1 ; python_version >= "3.10" and python_version < "4.0"
95
+ triton==2.2.0 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version < "3.12" and python_version >= "3.10"
96
+ typing-extensions==4.10.0 ; python_version >= "3.10" and python_version < "4.0"
97
+ typing-inspect==0.9.0 ; python_version >= "3.10" and python_version < "4.0"
98
+ tzdata==2024.1 ; python_version >= "3.10" and python_version < "4.0"
99
+ urllib3==2.2.1 ; python_version >= "3.10" and python_version < "4.0"
100
+ watchdog==4.0.0 ; python_version >= "3.10" and python_version < "4.0" and platform_system != "Darwin"
101
+ yarl==1.9.4 ; python_version >= "3.10" and python_version < "4.0"
save_mpi_mining_data_as_text_backup.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# -*- coding: utf-8 -*-
"""Save MPI Mining Data as Text Backup

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1bpGk38t5or9A4MCj8QVPLdTRH0D9qIN-

Setup script: fetches the mining-data repository, installs the Python
dependencies, then configures the hosted LLM endpoint and the embedding
model used by the rest of the pipeline.
"""

import datetime
import os
import subprocess
import sys
import time
from io import StringIO

# The Colab export used IPython shell magics ("!git clone", "!pip install"),
# which are a syntax error in a plain .py file. Run the same commands through
# subprocess so the script works outside a notebook.
MPI_DATA_REPO = "https://github.com/ricklon/mpi_data.git"
MPI_DATA_DIR = "mpi_data"

# Skip the clone when the checkout already exists so re-running the script
# does not abort on git's "destination path already exists" error.
if not os.path.isdir(MPI_DATA_DIR):
    subprocess.run(["git", "clone", MPI_DATA_REPO], check=True)
# TODO: pages should store the entire collection to the embedding as well.

# Install with the interpreter that is running this script so the packages
# land in the active environment.
subprocess.run(
    [
        sys.executable, "-m", "pip", "install", "-q",
        "langchain",
        "pypdf",
        "langchain_community",
        "sentence_transformers",
        "faiss-cpu",
        "pandas",
        "tqdm",
    ],
    check=True,
)

# Third-party imports must follow the install step above.
import pandas as pd  # noqa: E402
from tqdm import tqdm  # noqa: E402

from langchain_core.prompts import ChatPromptTemplate  # noqa: E402
from langchain_core.runnables import RunnablePassthrough  # noqa: E402
from langchain_core.output_parsers import StrOutputParser  # noqa: E402

from langchain_community.vectorstores import FAISS  # noqa: E402
from langchain_community.embeddings import HuggingFaceEmbeddings  # noqa: E402
from langchain_community.llms import HuggingFaceEndpoint  # noqa: E402

# `langchain.document_loaders` is a deprecated re-export; the loader lives in
# langchain_community, which this file already depends on.
from langchain_community.document_loaders import PyPDFDirectoryLoader  # noqa: E402

EMBEDDING_MODEL_NAME = "thenlper/gte-small"
HG_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
# hg_model = "HuggingFaceH4/zephyr-7b-beta"
# hg_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  ## Tiny LLAMA not supported

# Hosted text-generation endpoint; sampling parameters are kept exactly as
# in the original notebook.
llm = HuggingFaceEndpoint(
    repo_id=HG_MODEL,
    max_new_tokens=250,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.01,
    repetition_penalty=1.035,
)

VECTOR_SOURCE = "miningdata"

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
)
utils_cli.sh ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Function to export requirements using Poetry
4
export_requirements() {
    # Export the Poetry-locked dependency set as a pip requirements file
    # (without hashes) into the current directory.
    local target="requirements.txt"
    poetry export \
        --format=requirements.txt \
        --output="$target" \
        --without-hashes
}
7
+
8
+ # Main function
9
main() {
    # Show the menu, read one choice from stdin, and dispatch on it.
    # Options 2-5 are placeholders that only echo the selection.
    local choice=""

    echo "Choose an option:"
    echo "1. Export requirements using Poetry"
    echo "2. Option 2"
    echo "3. Option 3"
    echo "4. Option 4"
    echo "5. Option 5"
    # -r keeps backslashes in the input literal instead of treating them as
    # escape characters.
    read -r -p "Enter your choice [1-5]: " choice

    case "$choice" in
        1)
            export_requirements
            ;;
        [2-5])
            # Placeholder options: split this pattern into its own branch
            # when adding real code for an option.
            echo "You chose option $choice"
            ;;
        *)
            echo "Invalid choice. Please enter a number from 1 to 5."
            ;;
    esac
}
43
+
44
+ # Call the main function
45
+ main