# for generate (gradio server) and finetune
datasets==2.12.0
sentencepiece==0.1.97
gradio==3.31.0
huggingface_hub==0.14.1
appdirs==1.4.4
fire==0.5.0
docutils==0.19
torch==2.0.1
evaluate==0.4.0
rouge_score==0.1.2
sacrebleu==2.3.1
scikit-learn==1.2.2
alt-profanity-check==1.2.2
better-profanity==0.6.1
numpy==1.24.2
pandas==2.0.0
matplotlib==3.7.1
loralib==0.1.1
bitsandbytes==0.39.0
accelerate==0.19.0
git+https://github.com/huggingface/peft.git@3714aa2fff158fdfa637b2b65952580801d890b2
transformers==4.28.1
tokenizers==0.13.3
APScheduler==3.10.1

# optional for generate
pynvml==11.5.0
psutil==5.9.4
boto3==1.26.101
botocore==1.29.101

# optional for finetune
tensorboard==2.12.1
neptune==1.1.1

# for gradio client
gradio_client==0.2.5
beautifulsoup4==4.12.2
markdown==3.4.1

# data and testing
pytest==7.2.2
pytest-xdist==3.2.1
nltk==3.8.1
textstat==0.7.3
pandoc==2.3
#pypandoc==1.11
pypandoc_binary==1.11
openpyxl==3.1.2
lm_dataformat==0.0.20
bioc==2.0

# falcon
einops==0.6.1
instructorembedding==1.0.1

# for gpt4all .env file, but avoid worrying about imports
python-dotenv==1.0.0

# optional for chat with PDF
langchain==0.0.193
pypdf==3.8.1
tiktoken==0.3.3
# avoid textract, requires old six
#textract==1.6.5

# for HF embeddings
sentence_transformers==2.2.2
# for OpenAI embeddings (requires key)
openai==0.27.6

# local vector db
chromadb==0.3.25
# server vector db
#pymilvus==2.2.8

# weak url support, if can't install opencv etc. If comment-in this one, then comment-out unstructured[local-inference]==0.6.6
# unstructured==0.6.6

# strong support for images
# Requires on Ubuntu: sudo apt-get install libmagic-dev poppler-utils tesseract-ocr libreoffice
unstructured[local-inference]==0.6.6
#pdf2image==1.16.3
#pytesseract==0.3.10
# NOTE(review): pillow is not version-pinned, unlike the other requirements in this file; consider pinning
pillow

pdfminer.six==20221105
urllib3==1.26.6
requests_file==1.5.1

#pdf2image==1.16.3
#pytesseract==0.3.10
tabulate==0.9.0
# FYI pandoc already part of requirements.txt

# JSONLoader, but makes some trouble for some users
# jq==1.4.1

# to check licenses
# Run: pip-licenses|grep -v 'BSD\|Apache\|MIT'
pip-licenses==4.3.0

# weaviate vector db
weaviate-client==3.19.2

# faiss vector db (GPU build)
faiss-gpu==1.7.2
gpt4all==0.2.3
llama-cpp-python==0.1.55
arxiv==1.4.7
pymupdf==1.22.3 # AGPL license
# extract-msg==0.41.1  # GPL3