Carlos Salgado committed on
Commit
06e72ee
1 Parent(s): 685ce53

remove unnecessary draft files, ignore vscode config

Browse files
.gitignore CHANGED
@@ -1,7 +1,9 @@
1
  .envrc
2
  .direnv/
3
  .env
4
- .venv
5
  .ipynb_checkpoints
6
  flake.nix
7
  *__pycache__*
 
 
 
1
  .envrc
2
  .direnv/
3
  .env
4
+ *venv
5
  .ipynb_checkpoints
6
  flake.nix
7
  *__pycache__*
8
+ .idea
9
+
backend/langchain_vectara.ipynb DELETED
@@ -1,900 +0,0 @@
1
- {
2
- "nbformat": 4,
3
- "nbformat_minor": 0,
4
- "metadata": {
5
- "colab": {
6
- "provenance": []
7
- },
8
- "kernelspec": {
9
- "name": "python3",
10
- "display_name": "Python 3"
11
- },
12
- "language_info": {
13
- "name": "python"
14
- }
15
- },
16
- "cells": [
17
- {
18
- "cell_type": "code",
19
- "source": [
20
- "!pip install -r requirements.txt"
21
- ],
22
- "metadata": {
23
- "colab": {
24
- "base_uri": "https://localhost:8080/"
25
- },
26
- "id": "lVZX4hy1Ruq_",
27
- "outputId": "0c963932-2266-4c44-d671-07dc23625bae"
28
- },
29
- "execution_count": 2,
30
- "outputs": [
31
- {
32
- "output_type": "stream",
33
- "name": "stdout",
34
- "text": [
35
- "Requirement already satisfied: langchain_community in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 1)) (0.0.32)\n",
36
- "Requirement already satisfied: langchain-text-splitters in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 2)) (0.0.1)\n",
37
- "Requirement already satisfied: langchain-together in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 3)) (0.1.0)\n",
38
- "Requirement already satisfied: pdf2image in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 5)) (1.17.0)\n",
39
- "Requirement already satisfied: pdfminer.six in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 6)) (20231228)\n",
40
- "Requirement already satisfied: pillow_heif in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 7)) (0.16.0)\n",
41
- "Requirement already satisfied: doctran in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 8)) (0.0.14)\n",
42
- "Collecting python-dotenv (from -r requirements.txt (line 9))\n",
43
- " Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)\n",
44
- "Requirement already satisfied: unstructured[local-inference] in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 4)) (0.13.2)\n",
45
- "Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (6.0.1)\n",
46
- "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (2.0.29)\n",
47
- "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (3.9.3)\n",
48
- "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (0.6.4)\n",
49
- "Requirement already satisfied: langchain-core<0.2.0,>=0.1.41 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (0.1.42)\n",
50
- "Requirement already satisfied: langsmith<0.2.0,>=0.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (0.1.47)\n",
51
- "Requirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (1.25.2)\n",
52
- "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (2.31.0)\n",
53
- "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (8.2.3)\n",
54
- "Requirement already satisfied: together<0.3.0,>=0.2.10 in /usr/local/lib/python3.10/dist-packages (from langchain-together->-r requirements.txt (line 3)) (0.2.11)\n",
55
- "Requirement already satisfied: chardet in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (5.2.0)\n",
56
- "Requirement already satisfied: filetype in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (1.2.0)\n",
57
- "Requirement already satisfied: python-magic in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (0.4.27)\n",
58
- "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (4.9.4)\n",
59
- "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (3.8.1)\n",
60
- "Requirement already satisfied: tabulate in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (0.9.0)\n",
61
- "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (4.12.3)\n",
62
- "Requirement already satisfied: emoji in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (2.11.0)\n",
63
- "Requirement already satisfied: python-iso639 in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (2024.2.7)\n",
64
- "Requirement already satisfied: langdetect in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (1.0.9)\n",
65
- "Requirement already satisfied: rapidfuzz in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (3.8.1)\n",
66
- "Requirement already satisfied: backoff in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (2.2.1)\n",
67
- "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (4.11.0)\n",
68
- "Requirement already satisfied: unstructured-client<=0.18.0 in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (0.18.0)\n",
69
- "Requirement already satisfied: wrapt in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (1.14.1)\n",
70
- "Requirement already satisfied: unstructured.pytesseract>=0.3.12 in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (0.3.12)\n",
71
- "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (3.3)\n",
72
- "Requirement already satisfied: pypdf in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (4.2.0)\n",
73
- "Requirement already satisfied: python-pptx<=0.6.23 in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (0.6.23)\n",
74
- "Requirement already satisfied: pypandoc in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (1.13)\n",
75
- "Requirement already satisfied: markdown in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (3.6)\n",
76
- "Requirement already satisfied: openpyxl in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (3.1.2)\n",
77
- "Requirement already satisfied: xlrd in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (2.0.1)\n",
78
- "Requirement already satisfied: python-docx in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (1.1.0)\n",
79
- "Requirement already satisfied: pikepdf in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (8.15.0)\n",
80
- "Requirement already satisfied: unstructured-inference==0.7.25 in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (0.7.25)\n",
81
- "Requirement already satisfied: onnx in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (1.16.0)\n",
82
- "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (2.0.3)\n",
83
- "Requirement already satisfied: msg-parser in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (1.2.0)\n",
84
- "Requirement already satisfied: layoutparser[layoutmodels,tesseract] in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]->-r requirements.txt (line 4)) (0.3.4)\n",
85
- "Requirement already satisfied: python-multipart in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]->-r requirements.txt (line 4)) (0.0.9)\n",
86
- "Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]->-r requirements.txt (line 4)) (0.20.3)\n",
87
- "Requirement already satisfied: opencv-python!=4.7.0.68 in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]->-r requirements.txt (line 4)) (4.8.0.76)\n",
88
- "Requirement already satisfied: onnxruntime<1.16 in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]->-r requirements.txt (line 4)) (1.15.1)\n",
89
- "Requirement already satisfied: transformers>=4.25.1 in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]->-r requirements.txt (line 4)) (4.38.2)\n",
90
- "Requirement already satisfied: pillow in /usr/local/lib/python3.10/dist-packages (from pdf2image->-r requirements.txt (line 5)) (10.3.0)\n",
91
- "Requirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from pdfminer.six->-r requirements.txt (line 6)) (3.3.2)\n",
92
- "Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.10/dist-packages (from pdfminer.six->-r requirements.txt (line 6)) (42.0.5)\n",
93
- "Requirement already satisfied: openai<0.28.0,>=0.27.8 in /usr/local/lib/python3.10/dist-packages (from doctran->-r requirements.txt (line 8)) (0.27.10)\n",
94
- "Requirement already satisfied: presidio-analyzer<3.0.0,>=2.2.33 in /usr/local/lib/python3.10/dist-packages (from doctran->-r requirements.txt (line 8)) (2.2.354)\n",
95
- "Requirement already satisfied: presidio-anonymizer<3.0.0,>=2.2.33 in /usr/local/lib/python3.10/dist-packages (from doctran->-r requirements.txt (line 8)) (2.2.354)\n",
96
- "Requirement already satisfied: pydantic<2.0.0,>=1.10.9 in /usr/local/lib/python3.10/dist-packages (from doctran->-r requirements.txt (line 8)) (1.10.15)\n",
97
- "Requirement already satisfied: spacy<4.0.0,>=3.5.4 in /usr/local/lib/python3.10/dist-packages (from doctran->-r requirements.txt (line 8)) (3.7.4)\n",
98
- "Requirement already satisfied: tiktoken<0.6.0,>=0.5.0 in /usr/local/lib/python3.10/dist-packages (from doctran->-r requirements.txt (line 8)) (0.5.2)\n",
99
- "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community->-r requirements.txt (line 1)) (1.3.1)\n",
100
- "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community->-r requirements.txt (line 1)) (23.2.0)\n",
101
- "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community->-r requirements.txt (line 1)) (1.4.1)\n",
102
- "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community->-r requirements.txt (line 1)) (6.0.5)\n",
103
- "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community->-r requirements.txt (line 1)) (1.9.4)\n",
104
- "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community->-r requirements.txt (line 1)) (4.0.3)\n",
105
- "Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-packages (from cryptography>=36.0.0->pdfminer.six->-r requirements.txt (line 6)) (1.16.0)\n",
106
- "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain_community->-r requirements.txt (line 1)) (3.21.1)\n",
107
- "Requirement already satisfied: typing-inspect<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain_community->-r requirements.txt (line 1)) (0.9.0)\n",
108
- "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.2.0,>=0.1.41->langchain_community->-r requirements.txt (line 1)) (1.33)\n",
109
- "Requirement already satisfied: packaging<24.0,>=23.2 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.2.0,>=0.1.41->langchain_community->-r requirements.txt (line 1)) (23.2)\n",
110
- "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.2.0,>=0.1.0->langchain_community->-r requirements.txt (line 1)) (3.10.0)\n",
111
- "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from openai<0.28.0,>=0.27.8->doctran->-r requirements.txt (line 8)) (4.66.2)\n",
112
- "Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from presidio-analyzer<3.0.0,>=2.2.33->doctran->-r requirements.txt (line 8)) (2023.12.25)\n",
113
- "Requirement already satisfied: tldextract in /usr/local/lib/python3.10/dist-packages (from presidio-analyzer<3.0.0,>=2.2.33->doctran->-r requirements.txt (line 8)) (5.1.2)\n",
114
- "Requirement already satisfied: phonenumbers<9.0.0,>=8.12 in /usr/local/lib/python3.10/dist-packages (from presidio-analyzer<3.0.0,>=2.2.33->doctran->-r requirements.txt (line 8)) (8.13.34)\n",
115
- "Requirement already satisfied: pycryptodome>=3.10.1 in /usr/local/lib/python3.10/dist-packages (from presidio-anonymizer<3.0.0,>=2.2.33->doctran->-r requirements.txt (line 8)) (3.20.0)\n",
116
- "Requirement already satisfied: XlsxWriter>=0.5.7 in /usr/local/lib/python3.10/dist-packages (from python-pptx<=0.6.23->unstructured[local-inference]->-r requirements.txt (line 4)) (3.2.0)\n",
117
- "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community->-r requirements.txt (line 1)) (3.6)\n",
118
- "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community->-r requirements.txt (line 1)) (2.0.7)\n",
119
- "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community->-r requirements.txt (line 1)) (2024.2.2)\n",
120
- "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (3.0.12)\n",
121
- "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (1.0.5)\n",
122
- "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (1.0.10)\n",
123
- "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (2.0.8)\n",
124
- "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (3.0.9)\n",
125
- "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (8.2.3)\n",
126
- "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (1.1.2)\n",
127
- "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (2.4.8)\n",
128
- "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (2.0.10)\n",
129
- "Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (0.3.4)\n",
130
- "Requirement already satisfied: typer<0.10.0,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (0.9.4)\n",
131
- "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (6.4.0)\n",
132
- "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (3.1.3)\n",
133
- "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (69.5.0)\n",
134
- "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (3.3.0)\n",
135
- "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain_community->-r requirements.txt (line 1)) (3.0.3)\n",
136
- "INFO: pip is looking at multiple versions of together to determine which version is compatible with other requirements. This could take a while.\n",
137
- "Collecting together<0.3.0,>=0.2.10 (from langchain-together->-r requirements.txt (line 3))\n",
138
- " Downloading together-0.2.10-py3-none-any.whl.metadata (26 kB)\n",
139
- "Collecting spacy<4.0.0,>=3.5.4 (from doctran->-r requirements.txt (line 8))\n",
140
- " Downloading spacy-3.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)\n",
141
- " Downloading spacy-3.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)\n",
142
- " Downloading spacy-3.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)\n",
143
- "Collecting pathy>=0.10.0 (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8))\n",
144
- " Downloading pathy-0.11.0-py3-none-any.whl.metadata (16 kB)\n",
145
- "INFO: pip is still looking at multiple versions of together to determine which version is compatible with other requirements. This could take a while.\n",
146
- "Collecting spacy<4.0.0,>=3.5.4 (from doctran->-r requirements.txt (line 8))\n",
147
- " Downloading spacy-3.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)\n",
148
- " Downloading spacy-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)\n",
149
- "Collecting thinc<8.2.0,>=8.1.8 (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8))\n",
150
- " Downloading thinc-8.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)\n",
151
- "Collecting spacy<4.0.0,>=3.5.4 (from doctran->-r requirements.txt (line 8))\n",
152
- " Downloading spacy-3.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)\n",
153
- "INFO: This is taking longer than usual. You might need to provide the dependency resolver with stricter constraints to reduce runtime. See https://pip.pypa.io/warnings/backtracking for guidance. If you want to abort this run, press Ctrl + C.\n",
154
- " Downloading spacy-3.5.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)\n",
155
- "Collecting presidio-analyzer<3.0.0,>=2.2.33 (from doctran->-r requirements.txt (line 8))\n",
156
- " Downloading presidio_analyzer-2.2.354-py3-none-any.whl.metadata (2.6 kB)\n",
157
- "Collecting langsmith<0.2.0,>=0.1.0 (from langchain_community->-r requirements.txt (line 1))\n",
158
- " Downloading langsmith-0.1.47-py3-none-any.whl.metadata (13 kB)\n",
159
- " Downloading langsmith-0.1.46-py3-none-any.whl.metadata (13 kB)\n",
160
- " Downloading langsmith-0.1.45-py3-none-any.whl.metadata (13 kB)\n",
161
- " Downloading langsmith-0.1.44-py3-none-any.whl.metadata (13 kB)\n",
162
- " Downloading langsmith-0.1.43-py3-none-any.whl.metadata (13 kB)\n",
163
- " Downloading langsmith-0.1.42-py3-none-any.whl.metadata (13 kB)\n",
164
- " Downloading langsmith-0.1.41-py3-none-any.whl.metadata (13 kB)\n",
165
- " Downloading langsmith-0.1.40-py3-none-any.whl.metadata (13 kB)\n",
166
- " Downloading langsmith-0.1.39-py3-none-any.whl.metadata (13 kB)\n",
167
- " Downloading langsmith-0.1.38-py3-none-any.whl.metadata (13 kB)\n",
168
- " Downloading langsmith-0.1.37-py3-none-any.whl.metadata (13 kB)\n",
169
- " Downloading langsmith-0.1.36-py3-none-any.whl.metadata (13 kB)\n",
170
- " Downloading langsmith-0.1.35-py3-none-any.whl.metadata (13 kB)\n",
171
- " Downloading langsmith-0.1.34-py3-none-any.whl.metadata (13 kB)\n",
172
- " Downloading langsmith-0.1.33-py3-none-any.whl.metadata (13 kB)\n",
173
- " Downloading langsmith-0.1.31-py3-none-any.whl.metadata (13 kB)\n",
174
- " Downloading langsmith-0.1.30-py3-none-any.whl.metadata (13 kB)\n",
175
- " Downloading langsmith-0.1.29-py3-none-any.whl.metadata (13 kB)\n",
176
- " Downloading langsmith-0.1.28-py3-none-any.whl.metadata (13 kB)\n",
177
- " Downloading langsmith-0.1.27-py3-none-any.whl.metadata (13 kB)\n",
178
- " Downloading langsmith-0.1.26-py3-none-any.whl.metadata (13 kB)\n",
179
- " Downloading langsmith-0.1.25-py3-none-any.whl.metadata (13 kB)\n",
180
- " Downloading langsmith-0.1.24-py3-none-any.whl.metadata (13 kB)\n",
181
- " Downloading langsmith-0.1.23-py3-none-any.whl.metadata (13 kB)\n",
182
- " Downloading langsmith-0.1.22-py3-none-any.whl.metadata (13 kB)\n",
183
- " Downloading langsmith-0.1.21-py3-none-any.whl.metadata (13 kB)\n",
184
- " Downloading langsmith-0.1.20-py3-none-any.whl.metadata (13 kB)\n",
185
- " Downloading langsmith-0.1.19-py3-none-any.whl.metadata (13 kB)\n",
186
- " Downloading langsmith-0.1.18-py3-none-any.whl.metadata (13 kB)\n",
187
- " Downloading langsmith-0.1.17-py3-none-any.whl.metadata (13 kB)\n",
188
- " Downloading langsmith-0.1.16-py3-none-any.whl.metadata (13 kB)\n",
189
- " Downloading langsmith-0.1.15-py3-none-any.whl.metadata (13 kB)\n",
190
- " Downloading langsmith-0.1.14-py3-none-any.whl.metadata (13 kB)\n",
191
- " Downloading langsmith-0.1.13-py3-none-any.whl.metadata (13 kB)\n",
192
- " Downloading langsmith-0.1.12-py3-none-any.whl.metadata (13 kB)\n",
193
- " Downloading langsmith-0.1.11-py3-none-any.whl.metadata (13 kB)\n",
194
- " Downloading langsmith-0.1.10-py3-none-any.whl.metadata (13 kB)\n",
195
- " Downloading langsmith-0.1.9-py3-none-any.whl.metadata (13 kB)\n",
196
- " Downloading langsmith-0.1.8-py3-none-any.whl.metadata (13 kB)\n",
197
- " Downloading langsmith-0.1.7-py3-none-any.whl.metadata (13 kB)\n",
198
- " Downloading langsmith-0.1.6-py3-none-any.whl.metadata (13 kB)\n",
199
- " Downloading langsmith-0.1.5-py3-none-any.whl.metadata (13 kB)\n",
200
- " Downloading langsmith-0.1.4-py3-none-any.whl.metadata (13 kB)\n",
201
- " Downloading langsmith-0.1.3-py3-none-any.whl.metadata (13 kB)\n",
202
- " Downloading langsmith-0.1.2-py3-none-any.whl.metadata (13 kB)\n",
203
- " Downloading langsmith-0.1.1-py3-none-any.whl.metadata (13 kB)\n",
204
- " Downloading langsmith-0.1.0-py3-none-any.whl.metadata (13 kB)\n",
205
- "Collecting langchain-core<0.2.0,>=0.1.41 (from langchain_community->-r requirements.txt (line 1))\n",
206
- " Downloading langchain_core-0.1.42-py3-none-any.whl.metadata (5.9 kB)\n",
207
- " Downloading langchain_core-0.1.41-py3-none-any.whl.metadata (5.9 kB)\n",
208
- "Collecting doctran (from -r requirements.txt (line 8))\n",
209
- " Downloading doctran-0.0.14-py3-none-any.whl.metadata (8.6 kB)\n",
210
- " Downloading doctran-0.0.13-py3-none-any.whl.metadata (8.6 kB)\n",
211
- " Downloading doctran-0.0.12-py3-none-any.whl.metadata (8.6 kB)\n",
212
- " Downloading doctran-0.0.11-py3-none-any.whl.metadata (8.5 kB)\n",
213
- " Downloading doctran-0.0.10-py3-none-any.whl.metadata (8.9 kB)\n",
214
- "Collecting tiktoken<0.5.0,>=0.4.0 (from doctran->-r requirements.txt (line 8))\n",
215
- " Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)\n",
216
- "Collecting doctran (from -r requirements.txt (line 8))\n",
217
- " Downloading doctran-0.0.9-py3-none-any.whl.metadata (8.9 kB)\n",
218
- "Collecting tiktoken<0.4.0,>=0.3.3 (from doctran->-r requirements.txt (line 8))\n",
219
- " Downloading tiktoken-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)\n",
220
- "Collecting doctran (from -r requirements.txt (line 8))\n",
221
- " Downloading doctran-0.0.8-py3-none-any.whl.metadata (8.9 kB)\n",
222
- " Downloading doctran-0.0.7-py3-none-any.whl.metadata (8.9 kB)\n",
223
- " Downloading doctran-0.0.6-py3-none-any.whl.metadata (8.9 kB)\n",
224
- " Downloading doctran-0.0.5-py3-none-any.whl.metadata (8.9 kB)\n",
225
- " Downloading doctran-0.0.4-py3-none-any.whl.metadata (8.9 kB)\n",
226
- " Downloading doctran-0.0.3-py3-none-any.whl.metadata (8.8 kB)\n",
227
- " Downloading doctran-0.0.2-py3-none-any.whl.metadata (8.9 kB)\n",
228
- "Collecting bs4<0.0.2,>=0.0.1 (from doctran->-r requirements.txt (line 8))\n",
229
- " Downloading bs4-0.0.1.tar.gz (1.1 kB)\n",
230
- " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
231
- "Requirement already satisfied: jsonschema<5.0.0,>=4.17.3 in /usr/local/lib/python3.10/dist-packages (from doctran->-r requirements.txt (line 8)) (4.19.2)\n",
232
- "Collecting mailbox<0.5,>=0.4 (from doctran->-r requirements.txt (line 8))\n",
233
- " Downloading mailbox-0.4.tar.gz (4.1 kB)\n",
234
- " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
235
- "Collecting pdfplumber<0.10.0,>=0.9.0 (from doctran->-r requirements.txt (line 8))\n",
236
- " Downloading pdfplumber-0.9.0-py3-none-any.whl.metadata (35 kB)\n",
237
- "Collecting doctran (from -r requirements.txt (line 8))\n",
238
- " Downloading doctran-0.0.1-py3-none-any.whl.metadata (5.5 kB)\n",
239
- " Downloading doctran-0.0.0-py3-none-any.whl.metadata (599 bytes)\n",
240
- "Collecting langchain-together (from -r requirements.txt (line 3))\n",
241
- " Downloading langchain_together-0.1.0-py3-none-any.whl.metadata (1.9 kB)\n",
242
- " Downloading langchain_together-0.0.2.post2-py3-none-any.whl.metadata (1.9 kB)\n",
243
- " Downloading langchain_together-0.0.2.post1-py3-none-any.whl.metadata (806 bytes)\n",
244
- " Downloading langchain_together-0.0.2-py3-none-any.whl.metadata (577 bytes)\n",
245
- " Downloading langchain_together-0.0.1-py3-none-any.whl.metadata (504 bytes)\n",
246
- "Collecting langchain-text-splitters (from -r requirements.txt (line 2))\n",
247
- " Downloading langchain_text_splitters-0.0.1-py3-none-any.whl.metadata (2.0 kB)\n",
248
- "Collecting langchain_community (from -r requirements.txt (line 1))\n",
249
- " Downloading langchain_community-0.0.32-py3-none-any.whl.metadata (8.5 kB)\n",
250
- " Downloading langchain_community-0.0.31-py3-none-any.whl.metadata (8.4 kB)\n",
251
- "Collecting langchain-core<0.2.0,>=0.1.37 (from langchain_community->-r requirements.txt (line 1))\n",
252
- " Downloading langchain_core-0.1.40-py3-none-any.whl.metadata (5.9 kB)\n",
253
- " Downloading langchain_core-0.1.39-py3-none-any.whl.metadata (5.9 kB)\n",
254
- " Downloading langchain_core-0.1.38-py3-none-any.whl.metadata (6.0 kB)\n",
255
- " Downloading langchain_core-0.1.37-py3-none-any.whl.metadata (6.0 kB)\n",
256
- "Collecting langchain_community (from -r requirements.txt (line 1))\n",
257
- " Downloading langchain_community-0.0.30-py3-none-any.whl.metadata (8.4 kB)\n",
258
- " Downloading langchain_community-0.0.29-py3-none-any.whl.metadata (8.3 kB)\n",
259
- "Collecting langchain-core<0.2.0,>=0.1.33 (from langchain_community->-r requirements.txt (line 1))\n",
260
- " Downloading langchain_core-0.1.36-py3-none-any.whl.metadata (6.0 kB)\n",
261
- " Downloading langchain_core-0.1.35-py3-none-any.whl.metadata (6.0 kB)\n",
262
- " Downloading langchain_core-0.1.34-py3-none-any.whl.metadata (6.0 kB)\n",
263
- " Downloading langchain_core-0.1.33-py3-none-any.whl.metadata (6.0 kB)\n",
264
- "Requirement already satisfied: anyio<5,>=3 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.2.0,>=0.1.33->langchain_community->-r requirements.txt (line 1)) (3.7.1)\n",
265
- "Collecting langchain_community (from -r requirements.txt (line 1))\n",
266
- " Downloading langchain_community-0.0.28-py3-none-any.whl.metadata (8.3 kB)\n",
267
- "Collecting langchain-core<0.2.0,>=0.1.31 (from langchain_community->-r requirements.txt (line 1))\n",
268
- " Downloading langchain_core-0.1.32-py3-none-any.whl.metadata (6.0 kB)\n",
269
- " Downloading langchain_core-0.1.31-py3-none-any.whl.metadata (6.0 kB)\n",
270
- "Collecting langchain_community (from -r requirements.txt (line 1))\n",
271
- " Downloading langchain_community-0.0.27-py3-none-any.whl.metadata (8.2 kB)\n",
272
- "Collecting langchain-core<0.2.0,>=0.1.30 (from langchain_community->-r requirements.txt (line 1))\n",
273
- " Downloading langchain_core-0.1.30-py3-none-any.whl.metadata (6.0 kB)\n",
274
- "Collecting langchain_community (from -r requirements.txt (line 1))\n",
275
- " Downloading langchain_community-0.0.26-py3-none-any.whl.metadata (8.2 kB)\n",
276
- "Collecting langchain-core<0.2.0,>=0.1.29 (from langchain_community->-r requirements.txt (line 1))\n",
277
- " Downloading langchain_core-0.1.29-py3-none-any.whl.metadata (6.0 kB)\n",
278
- "Collecting langchain_community (from -r requirements.txt (line 1))\n",
279
- " Downloading langchain_community-0.0.25-py3-none-any.whl.metadata (8.1 kB)\n",
280
- "\u001b[31mERROR: Exception:\n",
281
- "Traceback (most recent call last):\n",
282
- " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py\", line 180, in exc_logging_wrapper\n",
283
- " status = run_func(*args)\n",
284
- " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py\", line 245, in wrapper\n",
285
- " return func(self, options, args)\n",
286
- " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py\", line 377, in run\n",
287
- " requirement_set = resolver.resolve(\n",
288
- " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/resolution/resolvelib/resolver.py\", line 95, in resolve\n",
289
- " result = self._result = resolver.resolve(\n",
290
- " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/resolvelib/resolvers.py\", line 546, in resolve\n",
291
- " state = resolution.resolve(requirements, max_rounds=max_rounds)\n",
292
- " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/resolvelib/resolvers.py\", line 457, in resolve\n",
293
- " raise ResolutionTooDeep(max_rounds)\n",
294
- "pip._vendor.resolvelib.resolvers.ResolutionTooDeep: 200000\u001b[0m\u001b[31m\n",
295
- "\u001b[0m"
296
- ]
297
- }
298
- ]
299
- },
300
- {
301
- "cell_type": "code",
302
- "source": [
303
- "!pip install langchain_community langchain-text-splitters unstructured[local-inference] pdf2image pdfminer.six langchain-together pillow_heif"
304
- ],
305
- "metadata": {
306
- "colab": {
307
- "base_uri": "https://localhost:8080/",
308
- "height": 1000
309
- },
310
- "id": "j06J9xE60u0C",
311
- "outputId": "06248856-b7d5-402f-f38d-03e475f2786b"
312
- },
313
- "execution_count": 1,
314
- "outputs": [
315
- {
316
- "output_type": "stream",
317
- "name": "stdout",
318
- "text": [
319
- "Collecting langchain_community\n",
320
- " Downloading langchain_community-0.0.32-py3-none-any.whl (1.9 MB)\n",
321
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.9/1.9 MB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
322
- "\u001b[?25hCollecting langchain-text-splitters\n",
323
- " Downloading langchain_text_splitters-0.0.1-py3-none-any.whl (21 kB)\n",
324
- "Collecting unstructured[local-inference]\n",
325
- " Downloading unstructured-0.13.2-py3-none-any.whl (1.9 MB)\n",
326
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.9/1.9 MB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
327
- "\u001b[?25hCollecting pdf2image\n",
328
- " Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)\n",
329
- "Collecting pdfminer.six\n",
330
- " Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)\n",
331
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m21.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
332
- "\u001b[?25hCollecting langchain-together\n",
333
- " Downloading langchain_together-0.1.0-py3-none-any.whl (6.7 kB)\n",
334
- "Collecting pillow_heif\n",
335
- " Downloading pillow_heif-0.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.5 MB)\n",
336
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.5/7.5 MB\u001b[0m \u001b[31m15.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
337
- "\u001b[?25hRequirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (6.0.1)\n",
338
- "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (2.0.29)\n",
339
- "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (3.9.3)\n",
340
- "Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)\n",
341
- " Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)\n",
342
- "Collecting langchain-core<0.2.0,>=0.1.41 (from langchain_community)\n",
343
- " Downloading langchain_core-0.1.42-py3-none-any.whl (287 kB)\n",
344
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m287.5/287.5 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
345
- "\u001b[?25hCollecting langsmith<0.2.0,>=0.1.0 (from langchain_community)\n",
346
- " Downloading langsmith-0.1.47-py3-none-any.whl (113 kB)\n",
347
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m113.0/113.0 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
348
- "\u001b[?25hRequirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (1.25.2)\n",
349
- "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (2.31.0)\n",
350
- "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (8.2.3)\n",
351
- "Requirement already satisfied: chardet in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (5.2.0)\n",
352
- "Collecting filetype (from unstructured[local-inference])\n",
353
- " Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)\n",
354
- "Collecting python-magic (from unstructured[local-inference])\n",
355
- " Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)\n",
356
- "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (4.9.4)\n",
357
- "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (3.8.1)\n",
358
- "Requirement already satisfied: tabulate in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (0.9.0)\n",
359
- "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (4.12.3)\n",
360
- "Collecting emoji (from unstructured[local-inference])\n",
361
- " Downloading emoji-2.11.0-py2.py3-none-any.whl (433 kB)\n",
362
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m433.8/433.8 kB\u001b[0m \u001b[31m29.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
363
- "\u001b[?25hCollecting python-iso639 (from unstructured[local-inference])\n",
364
- " Downloading python_iso639-2024.2.7-py3-none-any.whl (274 kB)\n",
365
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m274.7/274.7 kB\u001b[0m \u001b[31m16.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
366
- "\u001b[?25hCollecting langdetect (from unstructured[local-inference])\n",
367
- " Downloading langdetect-1.0.9.tar.gz (981 kB)\n",
368
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m981.5/981.5 kB\u001b[0m \u001b[31m38.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
369
- "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
370
- "Collecting rapidfuzz (from unstructured[local-inference])\n",
371
- " Downloading rapidfuzz-3.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)\n",
372
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m31.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
373
- "\u001b[?25hCollecting backoff (from unstructured[local-inference])\n",
374
- " Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\n",
375
- "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (4.11.0)\n",
376
- "Collecting unstructured-client<=0.18.0 (from unstructured[local-inference])\n",
377
- " Downloading unstructured_client-0.18.0-py3-none-any.whl (21 kB)\n",
378
- "Requirement already satisfied: wrapt in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (1.14.1)\n",
379
- "Collecting unstructured.pytesseract>=0.3.12 (from unstructured[local-inference])\n",
380
- " Downloading unstructured.pytesseract-0.3.12-py3-none-any.whl (14 kB)\n",
381
- "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (3.3)\n",
382
- "Collecting pypdf (from unstructured[local-inference])\n",
383
- " Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)\n",
384
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m290.4/290.4 kB\u001b[0m \u001b[31m20.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
385
- "\u001b[?25hCollecting python-pptx<=0.6.23 (from unstructured[local-inference])\n",
386
- " Downloading python_pptx-0.6.23-py3-none-any.whl (471 kB)\n",
387
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m471.6/471.6 kB\u001b[0m \u001b[31m28.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
388
- "\u001b[?25hCollecting pypandoc (from unstructured[local-inference])\n",
389
- " Downloading pypandoc-1.13-py3-none-any.whl (21 kB)\n",
390
- "Requirement already satisfied: markdown in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (3.6)\n",
391
- "Requirement already satisfied: openpyxl in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (3.1.2)\n",
392
- "Requirement already satisfied: xlrd in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (2.0.1)\n",
393
- "Collecting python-docx (from unstructured[local-inference])\n",
394
- " Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)\n",
395
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m239.6/239.6 kB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
396
- "\u001b[?25hCollecting pikepdf (from unstructured[local-inference])\n",
397
- " Downloading pikepdf-8.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)\n",
398
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
399
- "\u001b[?25hCollecting unstructured-inference==0.7.25 (from unstructured[local-inference])\n",
400
- " Downloading unstructured_inference-0.7.25-py3-none-any.whl (58 kB)\n",
401
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.9/58.9 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
402
- "\u001b[?25hCollecting onnx (from unstructured[local-inference])\n",
403
- " Downloading onnx-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.9 MB)\n",
404
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.9/15.9 MB\u001b[0m \u001b[31m43.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
405
- "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (2.0.3)\n",
406
- "Collecting msg-parser (from unstructured[local-inference])\n",
407
- " Downloading msg_parser-1.2.0-py2.py3-none-any.whl (101 kB)\n",
408
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━���━\u001b[0m \u001b[32m101.8/101.8 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
409
- "\u001b[?25hCollecting layoutparser[layoutmodels,tesseract] (from unstructured-inference==0.7.25->unstructured[local-inference])\n",
410
- " Downloading layoutparser-0.3.4-py3-none-any.whl (19.2 MB)\n",
411
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m19.2/19.2 MB\u001b[0m \u001b[31m39.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
412
- "\u001b[?25hCollecting python-multipart (from unstructured-inference==0.7.25->unstructured[local-inference])\n",
413
- " Downloading python_multipart-0.0.9-py3-none-any.whl (22 kB)\n",
414
- "Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]) (0.20.3)\n",
415
- "Requirement already satisfied: opencv-python!=4.7.0.68 in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]) (4.8.0.76)\n",
416
- "Collecting onnxruntime<1.16 (from unstructured-inference==0.7.25->unstructured[local-inference])\n",
417
- " Downloading onnxruntime-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)\n",
418
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m58.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
419
- "\u001b[?25hRequirement already satisfied: transformers>=4.25.1 in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]) (4.38.2)\n",
420
- "Requirement already satisfied: pillow in /usr/local/lib/python3.10/dist-packages (from pdf2image) (9.4.0)\n",
421
- "Requirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from pdfminer.six) (3.3.2)\n",
422
- "Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.10/dist-packages (from pdfminer.six) (42.0.5)\n",
423
- "Collecting together<0.3.0,>=0.2.10 (from langchain-together)\n",
424
- " Downloading together-0.2.11-py3-none-any.whl (43 kB)\n",
425
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.8/43.8 kB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
426
- "\u001b[?25hCollecting pillow (from pdf2image)\n",
427
- " Downloading pillow-10.3.0-cp310-cp310-manylinux_2_28_x86_64.whl (4.5 MB)\n",
428
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m38.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
429
- "\u001b[?25hRequirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.3.1)\n",
430
- "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (23.2.0)\n",
431
- "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.4.1)\n",
432
- "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (6.0.5)\n",
433
- "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.9.4)\n",
434
- "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (4.0.3)\n",
435
- "Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-packages (from cryptography>=36.0.0->pdfminer.six) (1.16.0)\n",
436
- "Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)\n",
437
- " Downloading marshmallow-3.21.1-py3-none-any.whl (49 kB)\n",
438
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
439
- "\u001b[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)\n",
440
- " Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n",
441
- "Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.2.0,>=0.1.41->langchain_community)\n",
442
- " Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)\n",
443
- "Collecting packaging<24.0,>=23.2 (from langchain-core<0.2.0,>=0.1.41->langchain_community)\n",
444
- " Downloading packaging-23.2-py3-none-any.whl (53 kB)\n",
445
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.0/53.0 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
446
- "\u001b[?25hRequirement already satisfied: pydantic<3,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.2.0,>=0.1.41->langchain_community) (2.6.4)\n",
447
- "Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.0->langchain_community)\n",
448
- " Downloading orjson-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n",
449
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m144.8/144.8 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
450
- "\u001b[?25hCollecting XlsxWriter>=0.5.7 (from python-pptx<=0.6.23->unstructured[local-inference])\n",
451
- " Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)\n",
452
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m159.9/159.9 kB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
453
- "\u001b[?25hRequirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community) (3.6)\n",
454
- "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community) (2.0.7)\n",
455
- "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community) (2024.2.2)\n",
456
- "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain_community) (3.0.3)\n",
457
- "Collecting sseclient-py<2.0.0,>=1.7.2 (from together<0.3.0,>=0.2.10->langchain-together)\n",
458
- " Downloading sseclient_py-1.8.0-py2.py3-none-any.whl (8.8 kB)\n",
459
- "Requirement already satisfied: tqdm<5.0.0,>=4.66.1 in /usr/local/lib/python3.10/dist-packages (from together<0.3.0,>=0.2.10->langchain-together) (4.66.2)\n",
460
- "Requirement already satisfied: typer<0.10.0,>=0.9.0 in /usr/local/lib/python3.10/dist-packages (from together<0.3.0,>=0.2.10->langchain-together) (0.9.4)\n",
461
- "Collecting dataclasses-json-speakeasy>=0.5.11 (from unstructured-client<=0.18.0->unstructured[local-inference])\n",
462
- " Downloading dataclasses_json_speakeasy-0.5.11-py3-none-any.whl (28 kB)\n",
463
- "Collecting jsonpath-python>=1.0.6 (from unstructured-client<=0.18.0->unstructured[local-inference])\n",
464
- " Downloading jsonpath_python-1.0.6-py3-none-any.whl (7.6 kB)\n",
465
- "Collecting mypy-extensions>=1.0.0 (from unstructured-client<=0.18.0->unstructured[local-inference])\n",
466
- " Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n",
467
- "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from unstructured-client<=0.18.0->unstructured[local-inference]) (2.8.2)\n",
468
- "Requirement already satisfied: six>=1.16.0 in /usr/local/lib/python3.10/dist-packages (from unstructured-client<=0.18.0->unstructured[local-inference]) (1.16.0)\n",
469
- "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->unstructured[local-inference]) (2.5)\n",
470
- "Collecting olefile>=0.46 (from msg-parser->unstructured[local-inference])\n",
471
- " Downloading olefile-0.47-py2.py3-none-any.whl (114 kB)\n",
472
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m114.6/114.6 kB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
473
- "\u001b[?25hRequirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->unstructured[local-inference]) (8.1.7)\n",
474
- "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->unstructured[local-inference]) (1.4.0)\n",
475
- "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk->unstructured[local-inference]) (2023.12.25)\n",
476
- "Requirement already satisfied: protobuf>=3.20.2 in /usr/local/lib/python3.10/dist-packages (from onnx->unstructured[local-inference]) (3.20.3)\n",
477
- "Requirement already satisfied: et-xmlfile in /usr/local/lib/python3.10/dist-packages (from openpyxl->unstructured[local-inference]) (1.1.0)\n",
478
- "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->unstructured[local-inference]) (2023.4)\n",
479
- "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->unstructured[local-inference]) (2024.1)\n",
480
- "Collecting Deprecated (from pikepdf->unstructured[local-inference])\n",
481
- " Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)\n",
482
- "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six) (2.22)\n",
483
- "Collecting jsonpointer>=1.9 (from jsonpatch<2.0,>=1.33->langchain-core<0.2.0,>=0.1.41->langchain_community)\n",
484
- " Downloading jsonpointer-2.4-py2.py3-none-any.whl (7.8 kB)\n",
485
- "Collecting coloredlogs (from onnxruntime<1.16->unstructured-inference==0.7.25->unstructured[local-inference])\n",
486
- " Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n",
487
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
488
- "\u001b[?25hRequirement already satisfied: flatbuffers in /usr/local/lib/python3.10/dist-packages (from onnxruntime<1.16->unstructured-inference==0.7.25->unstructured[local-inference]) (24.3.25)\n",
489
- "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from onnxruntime<1.16->unstructured-inference==0.7.25->unstructured[local-inference]) (1.12)\n",
490
- "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain-core<0.2.0,>=0.1.41->langchain_community) (0.6.0)\n",
491
- "Requirement already satisfied: pydantic-core==2.16.3 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain-core<0.2.0,>=0.1.41->langchain_community) (2.16.3)\n",
492
- "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers>=4.25.1->unstructured-inference==0.7.25->unstructured[local-inference]) (3.13.4)\n",
493
- "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.25.1->unstructured-inference==0.7.25->unstructured[local-inference]) (0.15.2)\n",
494
- "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.25.1->unstructured-inference==0.7.25->unstructured[local-inference]) (0.4.2)\n",
495
- "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub->unstructured-inference==0.7.25->unstructured[local-inference]) (2023.6.0)\n",
496
- "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (1.11.4)\n",
497
- "Collecting iopath (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
498
- " Downloading iopath-0.1.10.tar.gz (42 kB)\n",
499
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.2/42.2 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
500
- "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
501
- "Collecting pdfplumber (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
502
- " Downloading pdfplumber-0.11.0-py3-none-any.whl (56 kB)\n",
503
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.4/56.4 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
504
- "\u001b[?25hCollecting pytesseract (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
505
- " Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)\n",
506
- "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (2.2.1+cu121)\n",
507
- "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (0.17.1+cu121)\n",
508
- "Collecting effdet (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
509
- " Downloading effdet-0.4.1-py3-none-any.whl (112 kB)\n",
510
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.5/112.5 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
511
- "\u001b[?25hCollecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<1.16->unstructured-inference==0.7.25->unstructured[local-inference])\n",
512
- " Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n",
513
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
514
- "\u001b[?25hCollecting timm>=0.9.2 (from effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
515
- " Downloading timm-0.9.16-py3-none-any.whl (2.2 MB)\n",
516
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m52.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
517
- "\u001b[?25hRequirement already satisfied: pycocotools>=2.0.2 in /usr/local/lib/python3.10/dist-packages (from effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (2.0.7)\n",
518
- "Collecting omegaconf>=2.0 (from effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
519
- " Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)\n",
520
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.5/79.5 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
521
- "\u001b[?25hRequirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (3.1.3)\n",
522
- "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
523
- " Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n",
524
- "Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
525
- " Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n",
526
- "Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
527
- " Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n",
528
- "Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
529
- " Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n",
530
- "Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
531
- " Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n",
532
- "Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
533
- " Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n",
534
- "Collecting nvidia-curand-cu12==10.3.2.106 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
535
- " Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n",
536
- "Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
537
- " Using cached nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n",
538
- "Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
539
- " Using cached nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n",
540
- "Collecting nvidia-nccl-cu12==2.19.3 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
541
- " Using cached nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)\n",
542
- "Collecting nvidia-nvtx-cu12==12.1.105 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
543
- " Using cached nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n",
544
- "Requirement already satisfied: triton==2.2.0 in /usr/local/lib/python3.10/dist-packages (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (2.2.0)\n",
545
- "Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
546
- " Using cached nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n",
547
- "Collecting portalocker (from iopath->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
548
- " Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)\n",
549
- "Collecting pypdfium2>=4.18.0 (from pdfplumber->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
550
- " Downloading pypdfium2-4.29.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)\n",
551
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m76.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
552
- "\u001b[?25hRequirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->onnxruntime<1.16->unstructured-inference==0.7.25->unstructured[local-inference]) (1.3.0)\n",
553
- "Collecting antlr4-python3-runtime==4.9.* (from omegaconf>=2.0->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
554
- " Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)\n",
555
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m117.0/117.0 kB\u001b[0m \u001b[31m12.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
556
- "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
557
- "Requirement already satisfied: matplotlib>=2.1.0 in /usr/local/lib/python3.10/dist-packages (from pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (3.7.1)\n",
558
- "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (2.1.5)\n",
559
- "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.1.0->pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (1.2.1)\n",
560
- "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.1.0->pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (0.12.1)\n",
561
- "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.1.0->pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (4.51.0)\n",
562
- "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.1.0->pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (1.4.5)\n",
563
- "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.1.0->pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (3.1.2)\n",
564
- "Building wheels for collected packages: langdetect, iopath, antlr4-python3-runtime\n",
565
- " Building wheel for langdetect (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
566
- " Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=e39cbb9b4aa1aad74d62b4cff3f3c84256c3b6b555b3762b863406e1ea056f1e\n",
567
- " Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106\n",
568
- " Building wheel for iopath (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
569
- " Created wheel for iopath: filename=iopath-0.1.10-py3-none-any.whl size=31532 sha256=b2aa25855332a5c43eef0c3000cb3b16d8a4c9f2578bc12e110a880bc2523f92\n",
570
- " Stored in directory: /root/.cache/pip/wheels/9a/a3/b6/ac0fcd1b4ed5cfeb3db92e6a0e476cfd48ed0df92b91080c1d\n",
571
- " Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
572
- " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144554 sha256=376b87330fe481da2030f2484d7d0ac28fbeb173911aef693e27e4bfb094c6a0\n",
573
- " Stored in directory: /root/.cache/pip/wheels/12/93/dd/1f6a127edc45659556564c5730f6d4e300888f4bca2d4c5a88\n",
574
- "Successfully built langdetect iopath antlr4-python3-runtime\n",
575
- "Installing collected packages: sseclient-py, filetype, antlr4-python3-runtime, XlsxWriter, rapidfuzz, python-multipart, python-magic, python-iso639, python-docx, pypdfium2, pypdf, pypandoc, portalocker, pillow, packaging, orjson, onnx, omegaconf, olefile, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, mypy-extensions, langdetect, jsonpointer, jsonpath-python, humanfriendly, emoji, Deprecated, backoff, unstructured.pytesseract, typing-inspect, python-pptx, pytesseract, pillow_heif, pikepdf, pdf2image, nvidia-cusparse-cu12, nvidia-cudnn-cu12, msg-parser, marshmallow, jsonpatch, iopath, coloredlogs, together, pdfminer.six, onnxruntime, nvidia-cusolver-cu12, langsmith, dataclasses-json-speakeasy, dataclasses-json, unstructured-client, pdfplumber, langchain-core, unstructured, layoutparser, langchain-together, langchain-text-splitters, langchain_community, timm, effdet, unstructured-inference\n",
576
- " Attempting uninstall: pillow\n",
577
- " Found existing installation: Pillow 9.4.0\n",
578
- " Uninstalling Pillow-9.4.0:\n",
579
- " Successfully uninstalled Pillow-9.4.0\n",
580
- " Attempting uninstall: packaging\n",
581
- " Found existing installation: packaging 24.0\n",
582
- " Uninstalling packaging-24.0:\n",
583
- " Successfully uninstalled packaging-24.0\n",
584
- "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
585
- "imageio 2.31.6 requires pillow<10.1.0,>=8.3.2, but you have pillow 10.3.0 which is incompatible.\u001b[0m\u001b[31m\n",
586
- "\u001b[0mSuccessfully installed Deprecated-1.2.14 XlsxWriter-3.2.0 antlr4-python3-runtime-4.9.3 backoff-2.2.1 coloredlogs-15.0.1 dataclasses-json-0.6.4 dataclasses-json-speakeasy-0.5.11 effdet-0.4.1 emoji-2.11.0 filetype-1.2.0 humanfriendly-10.0 iopath-0.1.10 jsonpatch-1.33 jsonpath-python-1.0.6 jsonpointer-2.4 langchain-core-0.1.42 langchain-text-splitters-0.0.1 langchain-together-0.1.0 langchain_community-0.0.32 langdetect-1.0.9 langsmith-0.1.47 layoutparser-0.3.4 marshmallow-3.21.1 msg-parser-1.2.0 mypy-extensions-1.0.0 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.127 nvidia-nvtx-cu12-12.1.105 olefile-0.47 omegaconf-2.3.0 onnx-1.16.0 onnxruntime-1.15.1 orjson-3.10.0 packaging-23.2 pdf2image-1.17.0 pdfminer.six-20231228 pdfplumber-0.11.0 pikepdf-8.15.0 pillow-10.3.0 pillow_heif-0.16.0 portalocker-2.8.2 pypandoc-1.13 pypdf-4.2.0 pypdfium2-4.29.0 pytesseract-0.3.10 python-docx-1.1.0 python-iso639-2024.2.7 python-magic-0.4.27 python-multipart-0.0.9 python-pptx-0.6.23 rapidfuzz-3.8.1 sseclient-py-1.8.0 timm-0.9.16 together-0.2.11 typing-inspect-0.9.0 unstructured-0.13.2 unstructured-client-0.18.0 unstructured-inference-0.7.25 unstructured.pytesseract-0.3.12\n"
587
- ]
588
- },
589
- {
590
- "output_type": "display_data",
591
- "data": {
592
- "application/vnd.colab-display-data+json": {
593
- "pip_warning": {
594
- "packages": [
595
- "PIL",
596
- "pydevd_plugins"
597
- ]
598
- },
599
- "id": "a05b30d70de54e25a3d0c32fffc55ab0"
600
- }
601
- },
602
- "metadata": {}
603
- }
604
- ]
605
- },
606
- {
607
- "cell_type": "code",
608
- "source": [
609
- "from langchain_community.document_loaders import TextLoader\n",
610
- "from langchain_community.embeddings.fake import FakeEmbeddings\n",
611
- "from langchain_community.vectorstores import Vectara\n",
612
- "from langchain_text_splitters import CharacterTextSplitter"
613
- ],
614
- "metadata": {
615
- "id": "bSRybIQ60tRl"
616
- },
617
- "execution_count": 5,
618
- "outputs": []
619
- },
620
- {
621
- "cell_type": "code",
622
- "source": [
623
- "from google.colab import userdata\n",
624
- "\n",
625
- "TOGETHER_API_KEY = userdata.get('TOGETHER_API_KEY')\n",
626
- "vectara_customer_id = userdata.get('VECTARA_CUSTOMER_ID')\n",
627
- "vectara_corpus_id = userdata.get('VECTARA_CORPUS_ID')\n",
628
- "vectara_api_key = userdata.get('VECTARA_API_KEY')"
629
- ],
630
- "metadata": {
631
- "id": "d98hRDFC3WyH"
632
- },
633
- "execution_count": 6,
634
- "outputs": []
635
- },
636
- {
637
- "cell_type": "code",
638
- "source": [
639
- "vectorstore = Vectara(\n",
640
- " vectara_customer_id=vectara_customer_id,\n",
641
- " vectara_corpus_id=vectara_corpus_id,\n",
642
- " vectara_api_key=vectara_api_key\n",
643
- " )"
644
- ],
645
- "metadata": {
646
- "id": "n7aGHYcyzgXK"
647
- },
648
- "execution_count": 7,
649
- "outputs": []
650
- },
651
- {
652
- "cell_type": "code",
653
- "source": [
654
- "from langchain_community.document_loaders import UnstructuredPDFLoader"
655
- ],
656
- "metadata": {
657
- "id": "aX5VJiU07RZs"
658
- },
659
- "execution_count": 8,
660
- "outputs": []
661
- },
662
- {
663
- "cell_type": "code",
664
- "source": [
665
- "!mkdir docs\n",
666
- "# upload sample file"
667
- ],
668
- "metadata": {
669
- "id": "UQors5XgGPV7"
670
- },
671
- "execution_count": 37,
672
- "outputs": []
673
- },
674
- {
675
- "cell_type": "code",
676
- "source": [
677
- "loader = UnstructuredPDFLoader('ISB-020-U3-W-S-01-B18003-001-020.pdf', strategy='fast')\n",
678
- "data = loader.load()"
679
- ],
680
- "metadata": {
681
- "id": "ULSBXZRcI_4R"
682
- },
683
- "execution_count": 9,
684
- "outputs": []
685
- },
686
- {
687
- "cell_type": "code",
688
- "source": [
689
- "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
690
- "docs = text_splitter.split_documents(data)"
691
- ],
692
- "metadata": {
693
- "colab": {
694
- "base_uri": "https://localhost:8080/"
695
- },
696
- "id": "rd_8GLJrPT5T",
697
- "outputId": "002488bd-f8a0-4099-c4c4-b685da7a8195"
698
- },
699
- "execution_count": 10,
700
- "outputs": [
701
- {
702
- "output_type": "stream",
703
- "name": "stderr",
704
- "text": [
705
- "WARNING:langchain_text_splitters.base:Created a chunk of size 3260, which is longer than the specified 1000\n",
706
- "WARNING:langchain_text_splitters.base:Created a chunk of size 1754, which is longer than the specified 1000\n",
707
- "WARNING:langchain_text_splitters.base:Created a chunk of size 1556, which is longer than the specified 1000\n",
708
- "WARNING:langchain_text_splitters.base:Created a chunk of size 2529, which is longer than the specified 1000\n",
709
- "WARNING:langchain_text_splitters.base:Created a chunk of size 2108, which is longer than the specified 1000\n",
710
- "WARNING:langchain_text_splitters.base:Created a chunk of size 1240, which is longer than the specified 1000\n",
711
- "WARNING:langchain_text_splitters.base:Created a chunk of size 1122, which is longer than the specified 1000\n"
712
- ]
713
- }
714
- ]
715
- },
716
- {
717
- "cell_type": "code",
718
- "source": [
719
- "import json\n",
720
- "\n",
721
- "from langchain_community.document_transformers import DoctranPropertyExtractor\n",
722
- "from langchain_core.documents import Document"
723
- ],
724
- "metadata": {
725
- "id": "6CM6bL6JRCCA"
726
- },
727
- "execution_count": 3,
728
- "outputs": []
729
- },
730
- {
731
- "cell_type": "code",
732
- "source": [
733
- "properties = [\n",
734
- " {\n",
735
- " \"name\": \"document_number\",\n",
736
- " \"description\": \"Unique identifier for the document within its project.\",\n",
737
- " \"type\": \"string\",\n",
738
- " \"required\": True\n",
739
- " },\n",
740
- " {\n",
741
- " \"name\": \"discipline\",\n",
742
- " \"description\": \"The discipline associated with the document.\",\n",
743
- " \"type\": \"string\",\n",
744
- " \"required\": True\n",
745
- " },\n",
746
- " {\n",
747
- " \"name\": \"title\",\n",
748
- " \"description\": \"Title of the document.\",\n",
749
- " \"type\": \"string\",\n",
750
- " \"required\": True\n",
751
- " },\n",
752
- " {\n",
753
- " \"name\": \"version\",\n",
754
- " \"description\": \"Version number of the document.\",\n",
755
- " \"type\": \"integer\",\n",
756
- " \"required\": True\n",
757
- " },\n",
758
- " {\n",
759
- " \"name\": \"date\",\n",
760
- " \"description\": \"Creation date of the document.\",\n",
761
- " \"type\": \"string\",\n",
762
- " \"format\": \"date\",\n",
763
- " \"required\": True\n",
764
- " },\n",
765
- " {\n",
766
- " \"name\": \"author\",\n",
767
- " \"description\": \"Author of the document.\",\n",
768
- " \"type\": \"object\",\n",
769
- " \"properties\": {\n",
770
- " \"name\": {\n",
771
- " \"type\": \"string\",\n",
772
- " \"required\": True\n",
773
- " },\n",
774
- " \"email\": {\n",
775
- " \"type\": \"string\",\n",
776
- " \"format\": \"email\",\n",
777
- " \"required\": False\n",
778
- " }\n",
779
- " },\n",
780
- " \"required\": True\n",
781
- " },\n",
782
- " {\n",
783
- " \"name\": \"related_documents\",\n",
784
- " \"description\": \"List of related documents.\",\n",
785
- " \"type\": \"array\",\n",
786
- " \"items\": {\n",
787
- " \"type\": \"string\"\n",
788
- " },\n",
789
- " \"required\": False\n",
790
- " },\n",
791
- " {\n",
792
- " \"name\": \"status\",\n",
793
- " \"description\": \"Current status of the document.\",\n",
794
- " \"type\": \"string\",\n",
795
- " \"enum\": [\"draft\", \"under_review\", \"approved\", \"rejected\"],\n",
796
- " \"required\": True\n",
797
- " },\n",
798
- " {\n",
799
- " \"name\": \"keywords\",\n",
800
- " \"description\": \"Keywords associated with the document.\",\n",
801
- " \"type\": \"array\",\n",
802
- " \"items\": {\n",
803
- " \"type\": \"string\"\n",
804
- " },\n",
805
- " \"required\": False\n",
806
- " },\n",
807
- " {\n",
808
- " \"name\": \"summary\",\n",
809
- " \"description\": \"Short summary of the document content.\",\n",
810
- " \"type\": \"string\",\n",
811
- " \"required\": False\n",
812
- " }\n",
813
- "]"
814
- ],
815
- "metadata": {
816
- "id": "9rBUSiR-bDAD"
817
- },
818
- "execution_count": 12,
819
- "outputs": []
820
- },
821
- {
822
- "cell_type": "code",
823
- "source": [
824
- "property_extractor = DoctranPropertyExtractor(properties=properties)"
825
- ],
826
- "metadata": {
827
- "colab": {
828
- "base_uri": "https://localhost:8080/",
829
- "height": 339
830
- },
831
- "id": "H5jIV-OYfJRg",
832
- "outputId": "0eb7dc55-088e-4912-c85e-dc4eb87e442c"
833
- },
834
- "execution_count": 13,
835
- "outputs": [
836
- {
837
- "output_type": "error",
838
- "ename": "ValueError",
839
- "evalue": "Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter.",
840
- "traceback": [
841
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
842
- "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
843
- "\u001b[0;32m<ipython-input-13-2ab54c930860>\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mproperty_extractor\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDoctranPropertyExtractor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mproperties\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mproperties\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
844
- "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_community/document_transformers/doctran_text_extract.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, properties, openai_api_key, openai_api_model)\u001b[0m\n\u001b[1;32m 57\u001b[0m ) -> None:\n\u001b[1;32m 58\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mproperties\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mproperties\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 59\u001b[0;31m self.openai_api_key = openai_api_key or get_from_env(\n\u001b[0m\u001b[1;32m 60\u001b[0m \u001b[0;34m\"openai_api_key\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"OPENAI_API_KEY\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m )\n",
845
- "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_core/utils/env.py\u001b[0m in \u001b[0;36mget_from_env\u001b[0;34m(key, env_key, default)\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdefault\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 41\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 42\u001b[0m \u001b[0;34mf\"Did not find {key}, please add an environment variable\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0;34mf\" `{env_key}` which contains it, or pass\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
846
- "\u001b[0;31mValueError\u001b[0m: Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter."
847
- ]
848
- }
849
- ]
850
- },
851
- {
852
- "cell_type": "markdown",
853
- "source": [],
854
- "metadata": {
855
- "id": "hVjJAK-KTTEE"
856
- }
857
- },
858
- {
859
- "cell_type": "code",
860
- "source": [
861
- "from dotenv import load_dotenv\n",
862
- "\n",
863
- "load_dotenv()"
864
- ],
865
- "metadata": {
866
- "colab": {
867
- "base_uri": "https://localhost:8080/",
868
- "height": 349
869
- },
870
- "id": "HVDCIqIDRJ3Z",
871
- "outputId": "4fbd508a-2179-4251-d2ae-2df5cce24187"
872
- },
873
- "execution_count": 4,
874
- "outputs": [
875
- {
876
- "output_type": "error",
877
- "ename": "ModuleNotFoundError",
878
- "evalue": "No module named 'dotenv'",
879
- "traceback": [
880
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
881
- "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
882
- "\u001b[0;32m<ipython-input-4-c9bdfc1ba4a4>\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mdotenv\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mload_dotenv\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mload_dotenv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
883
- "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'dotenv'",
884
- "",
885
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0;32m\nNOTE: If your import is failing due to a missing package, you can\nmanually install dependencies using either !pip or !apt.\n\nTo view examples of installing some common dependencies, click the\n\"Open Examples\" button below.\n\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n"
886
- ],
887
- "errorDetails": {
888
- "actions": [
889
- {
890
- "action": "open_url",
891
- "actionText": "Open Examples",
892
- "url": "/notebooks/snippets/importing_libraries.ipynb"
893
- }
894
- ]
895
- }
896
- }
897
- ]
898
- }
899
- ]
900
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/langchain_vectara.py DELETED
@@ -1,134 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- """langchain_vectara.ipynb
3
-
4
- Automatically generated by Colab.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/1XzD7XHO_a-gYFBnGUWH1MOtstT4sDY3J
8
- """
9
-
10
- !pip install -r requirements.txt
11
-
12
- !pip install langchain_community langchain-text-splitters unstructured[local-inference] pdf2image pdfminer.six langchain-together pillow_heif
13
-
14
- from langchain_community.document_loaders import TextLoader
15
- from langchain_community.embeddings.fake import FakeEmbeddings
16
- from langchain_community.vectorstores import Vectara
17
- from langchain_text_splitters import CharacterTextSplitter
18
-
19
- from google.colab import userdata
20
-
21
- TOGETHER_API_KEY = userdata.get('TOGETHER_API_KEY')
22
- vectara_customer_id = userdata.get('VECTARA_CUSTOMER_ID')
23
- vectara_corpus_id = userdata.get('VECTARA_CORPUS_ID')
24
- vectara_api_key = userdata.get('VECTARA_API_KEY')
25
-
26
- vectorstore = Vectara(
27
- vectara_customer_id=vectara_customer_id,
28
- vectara_corpus_id=vectara_corpus_id,
29
- vectara_api_key=vectara_api_key
30
- )
31
-
32
- from langchain_community.document_loaders import UnstructuredPDFLoader
33
-
34
- !mkdir docs
35
- # upload sample file
36
-
37
- loader = UnstructuredPDFLoader('ISB-020-U3-W-S-01-B18003-001-020.pdf', strategy='fast')
38
- data = loader.load()
39
-
40
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
41
- docs = text_splitter.split_documents(data)
42
-
43
- import json
44
-
45
- from langchain_community.document_transformers import DoctranPropertyExtractor
46
- from langchain_core.documents import Document
47
-
48
- properties = [
49
- {
50
- "name": "document_number",
51
- "description": "Unique identifier for the document within its project.",
52
- "type": "string",
53
- "required": True
54
- },
55
- {
56
- "name": "discipline",
57
- "description": "The discipline associated with the document.",
58
- "type": "string",
59
- "required": True
60
- },
61
- {
62
- "name": "title",
63
- "description": "Title of the document.",
64
- "type": "string",
65
- "required": True
66
- },
67
- {
68
- "name": "version",
69
- "description": "Version number of the document.",
70
- "type": "integer",
71
- "required": True
72
- },
73
- {
74
- "name": "date",
75
- "description": "Creation date of the document.",
76
- "type": "string",
77
- "format": "date",
78
- "required": True
79
- },
80
- {
81
- "name": "author",
82
- "description": "Author of the document.",
83
- "type": "object",
84
- "properties": {
85
- "name": {
86
- "type": "string",
87
- "required": True
88
- },
89
- "email": {
90
- "type": "string",
91
- "format": "email",
92
- "required": False
93
- }
94
- },
95
- "required": True
96
- },
97
- {
98
- "name": "related_documents",
99
- "description": "List of related documents.",
100
- "type": "array",
101
- "items": {
102
- "type": "string"
103
- },
104
- "required": False
105
- },
106
- {
107
- "name": "status",
108
- "description": "Current status of the document.",
109
- "type": "string",
110
- "enum": ["draft", "under_review", "approved", "rejected"],
111
- "required": True
112
- },
113
- {
114
- "name": "keywords",
115
- "description": "Keywords associated with the document.",
116
- "type": "array",
117
- "items": {
118
- "type": "string"
119
- },
120
- "required": False
121
- },
122
- {
123
- "name": "summary",
124
- "description": "Short summary of the document content.",
125
- "type": "string",
126
- "required": False
127
- }
128
- ]
129
-
130
- property_extractor = DoctranPropertyExtractor(properties=properties)
131
-
132
- from dotenv import load_dotenv
133
-
134
- load_dotenv()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/school_plumbing.txt DELETED
@@ -1,15 +0,0 @@
1
- This document describes the plumbing system for a typical school building. The system includes potable water supply, fixtures and appliances, drainage waste and vent (DWV) systems, and stormwater management.
2
-
3
- Potable Water Supply System:
4
- The potable water supply system consists of incoming water service from the municipal water main, backflow prevention device, water meter, pressure reducing valve, and distribution piping throughout the building to provide water to all fixtures and appliances. All materials used are lead-free and meet local codes and regulations.
5
- Fixtures and Appliances:
6
- Fixtures include sinks, toilets, urinals, drinking fountains, showers, eye wash stations, and laboratory equipment. All fixtures are selected based on ADA compliance, low flow rates, and water conservation standards. Appliances such as dishwashers, ice makers, and clothes washing machines are also provided with hot and cold water connections.
7
- Drainage Waste and Vent (DWV) Systems:
8
- The DWV system consists of sanitary and roof drains, vents, cleanouts, and traps designed to remove wastewater from fixtures and appliances while preventing sewer gases from entering the building. Materials used include PVC or cast iron pipes, fittings, and accessories.
9
- Stormwater Management:
10
- Stormwater management involves collecting rainwater runoff from rooftops and other impervious surfaces, conveying it through underground pipes, and disposing of it into an approved location. This may involve retention/detention ponds, swales, or other methods depending on local codes and regulations.
11
- Maintenance and Inspection:
12
- Regular maintenance and inspection of the plumbing system ensure its longevity and proper functioning. Routine tasks include checking for leaks, testing backflow preventers, inspecting fixtures and appliances, cleaning drains, and ensuring code compliance.
13
- Emergency Procedures:
14
- Emergency procedures should be established and communicated to staff members regarding responding to plumbing emergencies such as flooding, pipe bursts, or gas leaks. These procedures should include shutting off water supplies, contacting emergency responders, and notifying facilities personnel.
15
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/schulgebäudes.txt DELETED
@@ -1,15 +0,0 @@
1
- Diese Beschreibung umfasst das Rohrleitungssystem eines typischen Schulgebäudes. Das System besteht aus Trinkwasserversorgungs-, Einrichtungen und Geräten, Abwassersammel-, Lüftungs- und Regenwasserbewirtschaftungssystemen.
2
-
3
- Trinkwasserversorgungssystem:
4
- Das Trinkwasserversorgungssystem umfasst die eingehende Wasserzufuhr von der Hauptwasserleitung des Versorgungsunternehmens, einen Rückschlagverhinderer, einen Wasserzähler, eine Druckreduziervorrichtung sowie Verteilungsrohre im Gebäude zur Bereitstellung von Wasser für alle Einrichtungen und Geräte. Alle verwendeten Materialien sind bleifrei und entsprechen lokalen Vorschriften und Bestimmungen.
5
- Einrichtungen und Geräte:
6
- Zu den Einrichtungen gehören Waschbecken, Toiletten, Urinale, Trinkbrunnen, Duschen, Augenduschen und Laborausrüstungen. Alle Einrichtungen werden nach den Kriterien der Barrierefreiheit, niedriger Durchflussraten und Wasser sparenden Standards ausgewählt. Zu den Geräten zählen Geschirrspülmaschinen, Eismaschinen und Kleidungswaschanlagen mit heißem und kaltem Wasseranschluss.
7
- Sammel- und Entlüftungssystem (DWV):
8
- Das DWV-System besteht aus Schmutzwassertonnen, Dachabläufen, Belüftungen, Reinigungsöffnungen und Fallstricken, die dazu dienen, Abwasser von Einrichtungen und Geräten zu entfernen, während gleichzeitig verhindert wird, dass schädliche Gase in das Gebäude eindringen. Die verwendeten Materialien sind PVC oder Graugußrohre, -fittinge und -zubehörteile.
9
- Regenwasserbewirtschaftung:
10
- Die Regenwasserbewirtschaftung sieht vor, Niederschlagswasser von Dächern und anderen wasserundurchlässigen Oberflächen aufzunehmen, durch unterirdische Rohre abzuleiten und es an einem genehmigten Ort wieder abzugeben. Dies kann Bepumpungsanlagen, Retentions-/Entlastungspoldern oder andere Methoden umfassen, je nach lokalen Vorschriften und Bestimmungen.
11
- Wartung und Inspektion:
12
- Routinemäßige Wartung und Inspektion des Rohrleitungssystems gewährleisten seine Lebensdauer und ordnungsgemäße Funktion. Routineaufgaben umfassen das Überprüfen auf Lecks, Testen von Rückschlagverhinderern, Inspizieren von Einrichtungen und Geräten, Reinigen von Abflüssen und Gewährleistung der Codekonformität.
13
- Notfallmaßnahmen:
14
- Notfallmaßnahmen sollten festgelegt und dem Personal bekannt gemacht werden, um auf Plattfälle wie Überschwemmungen, Wasserrohrbrüche oder Gaslecks zu reagieren. Diese Maßnahmen umfassen das Absperren der Wasserzuflüsse, Kontaktaufnahme mit Notdiensten, Benachrichtigung von Einrichtungspersonal und Festlegen klarer Anweisungen für die Mitarbeiter.
15
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/together_call.py DELETED
@@ -1,44 +0,0 @@
1
- import os
2
- import json
3
- import openai
4
- from pydantic import BaseModel, Field
5
- from dotenv import load_dotenv
6
- load_dotenv()
7
-
8
-
9
- # Create client
10
- client = openai.OpenAI(
11
- base_url="https://api.together.xyz/v1",
12
- api_key=os.environ["TOGETHER_API_KEY"],
13
- )
14
-
15
- # Define the schema for the output.
16
- class User(BaseModel):
17
- name: str = Field(description="user name")
18
- address: str = Field(description="address")
19
-
20
- # Call the LLM with the JSON schema
21
- chat_completion = client.chat.completions.create(
22
- model="mistralai/Mixtral-8x7B-Instruct-v0.1",
23
- response_format={"type": "json_object", "schema": User.model_json_schema()},
24
- messages=[
25
- {
26
- "role": "system",
27
- "content": "You are a helpful assistant that answers in JSON.",
28
- },
29
- {
30
- "role": "user",
31
- "content": "Create a user named Alice, who lives in 42, Wonderland Avenue.",
32
- },
33
- ],
34
- )
35
-
36
- created_user = json.loads(chat_completion.choices[0].message.content)
37
- print(json.dumps(created_user, indent=2))
38
-
39
- """
40
- {
41
- "address": "42, Wonderland Avenue",
42
- "name": "Alice"
43
- }
44
- """