Carlos Salgado committed on
Commit
0c2a143
1 Parent(s): 429ff5f

add working langchain drafts and requirements file

Browse files
Files changed (4) hide show
  1. .gitignore +3 -0
  2. langchain_vectara.ipynb +900 -0
  3. langchain_vectara.py +134 -0
  4. requirements.txt +9 -1
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .envrc
2
+ .direnv/
3
+ flake.*
langchain_vectara.ipynb ADDED
@@ -0,0 +1,900 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "source": [
20
+ "!pip install -r requirements.txt"
21
+ ],
22
+ "metadata": {
23
+ "colab": {
24
+ "base_uri": "https://localhost:8080/"
25
+ },
26
+ "id": "lVZX4hy1Ruq_",
27
+ "outputId": "0c963932-2266-4c44-d671-07dc23625bae"
28
+ },
29
+ "execution_count": 2,
30
+ "outputs": [
31
+ {
32
+ "output_type": "stream",
33
+ "name": "stdout",
34
+ "text": [
35
+ "Requirement already satisfied: langchain_community in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 1)) (0.0.32)\n",
36
+ "Requirement already satisfied: langchain-text-splitters in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 2)) (0.0.1)\n",
37
+ "Requirement already satisfied: langchain-together in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 3)) (0.1.0)\n",
38
+ "Requirement already satisfied: pdf2image in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 5)) (1.17.0)\n",
39
+ "Requirement already satisfied: pdfminer.six in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 6)) (20231228)\n",
40
+ "Requirement already satisfied: pillow_heif in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 7)) (0.16.0)\n",
41
+ "Requirement already satisfied: doctran in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 8)) (0.0.14)\n",
42
+ "Collecting python-dotenv (from -r requirements.txt (line 9))\n",
43
+ " Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)\n",
44
+ "Requirement already satisfied: unstructured[local-inference] in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 4)) (0.13.2)\n",
45
+ "Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (6.0.1)\n",
46
+ "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (2.0.29)\n",
47
+ "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (3.9.3)\n",
48
+ "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (0.6.4)\n",
49
+ "Requirement already satisfied: langchain-core<0.2.0,>=0.1.41 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (0.1.42)\n",
50
+ "Requirement already satisfied: langsmith<0.2.0,>=0.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (0.1.47)\n",
51
+ "Requirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (1.25.2)\n",
52
+ "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (2.31.0)\n",
53
+ "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain_community->-r requirements.txt (line 1)) (8.2.3)\n",
54
+ "Requirement already satisfied: together<0.3.0,>=0.2.10 in /usr/local/lib/python3.10/dist-packages (from langchain-together->-r requirements.txt (line 3)) (0.2.11)\n",
55
+ "Requirement already satisfied: chardet in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (5.2.0)\n",
56
+ "Requirement already satisfied: filetype in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (1.2.0)\n",
57
+ "Requirement already satisfied: python-magic in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (0.4.27)\n",
58
+ "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (4.9.4)\n",
59
+ "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (3.8.1)\n",
60
+ "Requirement already satisfied: tabulate in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (0.9.0)\n",
61
+ "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (4.12.3)\n",
62
+ "Requirement already satisfied: emoji in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (2.11.0)\n",
63
+ "Requirement already satisfied: python-iso639 in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (2024.2.7)\n",
64
+ "Requirement already satisfied: langdetect in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (1.0.9)\n",
65
+ "Requirement already satisfied: rapidfuzz in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (3.8.1)\n",
66
+ "Requirement already satisfied: backoff in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (2.2.1)\n",
67
+ "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (4.11.0)\n",
68
+ "Requirement already satisfied: unstructured-client<=0.18.0 in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (0.18.0)\n",
69
+ "Requirement already satisfied: wrapt in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (1.14.1)\n",
70
+ "Requirement already satisfied: unstructured.pytesseract>=0.3.12 in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (0.3.12)\n",
71
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (3.3)\n",
72
+ "Requirement already satisfied: pypdf in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (4.2.0)\n",
73
+ "Requirement already satisfied: python-pptx<=0.6.23 in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (0.6.23)\n",
74
+ "Requirement already satisfied: pypandoc in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (1.13)\n",
75
+ "Requirement already satisfied: markdown in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (3.6)\n",
76
+ "Requirement already satisfied: openpyxl in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (3.1.2)\n",
77
+ "Requirement already satisfied: xlrd in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (2.0.1)\n",
78
+ "Requirement already satisfied: python-docx in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (1.1.0)\n",
79
+ "Requirement already satisfied: pikepdf in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (8.15.0)\n",
80
+ "Requirement already satisfied: unstructured-inference==0.7.25 in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (0.7.25)\n",
81
+ "Requirement already satisfied: onnx in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (1.16.0)\n",
82
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (2.0.3)\n",
83
+ "Requirement already satisfied: msg-parser in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]->-r requirements.txt (line 4)) (1.2.0)\n",
84
+ "Requirement already satisfied: layoutparser[layoutmodels,tesseract] in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]->-r requirements.txt (line 4)) (0.3.4)\n",
85
+ "Requirement already satisfied: python-multipart in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]->-r requirements.txt (line 4)) (0.0.9)\n",
86
+ "Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]->-r requirements.txt (line 4)) (0.20.3)\n",
87
+ "Requirement already satisfied: opencv-python!=4.7.0.68 in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]->-r requirements.txt (line 4)) (4.8.0.76)\n",
88
+ "Requirement already satisfied: onnxruntime<1.16 in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]->-r requirements.txt (line 4)) (1.15.1)\n",
89
+ "Requirement already satisfied: transformers>=4.25.1 in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]->-r requirements.txt (line 4)) (4.38.2)\n",
90
+ "Requirement already satisfied: pillow in /usr/local/lib/python3.10/dist-packages (from pdf2image->-r requirements.txt (line 5)) (10.3.0)\n",
91
+ "Requirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from pdfminer.six->-r requirements.txt (line 6)) (3.3.2)\n",
92
+ "Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.10/dist-packages (from pdfminer.six->-r requirements.txt (line 6)) (42.0.5)\n",
93
+ "Requirement already satisfied: openai<0.28.0,>=0.27.8 in /usr/local/lib/python3.10/dist-packages (from doctran->-r requirements.txt (line 8)) (0.27.10)\n",
94
+ "Requirement already satisfied: presidio-analyzer<3.0.0,>=2.2.33 in /usr/local/lib/python3.10/dist-packages (from doctran->-r requirements.txt (line 8)) (2.2.354)\n",
95
+ "Requirement already satisfied: presidio-anonymizer<3.0.0,>=2.2.33 in /usr/local/lib/python3.10/dist-packages (from doctran->-r requirements.txt (line 8)) (2.2.354)\n",
96
+ "Requirement already satisfied: pydantic<2.0.0,>=1.10.9 in /usr/local/lib/python3.10/dist-packages (from doctran->-r requirements.txt (line 8)) (1.10.15)\n",
97
+ "Requirement already satisfied: spacy<4.0.0,>=3.5.4 in /usr/local/lib/python3.10/dist-packages (from doctran->-r requirements.txt (line 8)) (3.7.4)\n",
98
+ "Requirement already satisfied: tiktoken<0.6.0,>=0.5.0 in /usr/local/lib/python3.10/dist-packages (from doctran->-r requirements.txt (line 8)) (0.5.2)\n",
99
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community->-r requirements.txt (line 1)) (1.3.1)\n",
100
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community->-r requirements.txt (line 1)) (23.2.0)\n",
101
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community->-r requirements.txt (line 1)) (1.4.1)\n",
102
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community->-r requirements.txt (line 1)) (6.0.5)\n",
103
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community->-r requirements.txt (line 1)) (1.9.4)\n",
104
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community->-r requirements.txt (line 1)) (4.0.3)\n",
105
+ "Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-packages (from cryptography>=36.0.0->pdfminer.six->-r requirements.txt (line 6)) (1.16.0)\n",
106
+ "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain_community->-r requirements.txt (line 1)) (3.21.1)\n",
107
+ "Requirement already satisfied: typing-inspect<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->langchain_community->-r requirements.txt (line 1)) (0.9.0)\n",
108
+ "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.2.0,>=0.1.41->langchain_community->-r requirements.txt (line 1)) (1.33)\n",
109
+ "Requirement already satisfied: packaging<24.0,>=23.2 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.2.0,>=0.1.41->langchain_community->-r requirements.txt (line 1)) (23.2)\n",
110
+ "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /usr/local/lib/python3.10/dist-packages (from langsmith<0.2.0,>=0.1.0->langchain_community->-r requirements.txt (line 1)) (3.10.0)\n",
111
+ "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from openai<0.28.0,>=0.27.8->doctran->-r requirements.txt (line 8)) (4.66.2)\n",
112
+ "Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from presidio-analyzer<3.0.0,>=2.2.33->doctran->-r requirements.txt (line 8)) (2023.12.25)\n",
113
+ "Requirement already satisfied: tldextract in /usr/local/lib/python3.10/dist-packages (from presidio-analyzer<3.0.0,>=2.2.33->doctran->-r requirements.txt (line 8)) (5.1.2)\n",
114
+ "Requirement already satisfied: phonenumbers<9.0.0,>=8.12 in /usr/local/lib/python3.10/dist-packages (from presidio-analyzer<3.0.0,>=2.2.33->doctran->-r requirements.txt (line 8)) (8.13.34)\n",
115
+ "Requirement already satisfied: pycryptodome>=3.10.1 in /usr/local/lib/python3.10/dist-packages (from presidio-anonymizer<3.0.0,>=2.2.33->doctran->-r requirements.txt (line 8)) (3.20.0)\n",
116
+ "Requirement already satisfied: XlsxWriter>=0.5.7 in /usr/local/lib/python3.10/dist-packages (from python-pptx<=0.6.23->unstructured[local-inference]->-r requirements.txt (line 4)) (3.2.0)\n",
117
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community->-r requirements.txt (line 1)) (3.6)\n",
118
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community->-r requirements.txt (line 1)) (2.0.7)\n",
119
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community->-r requirements.txt (line 1)) (2024.2.2)\n",
120
+ "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (3.0.12)\n",
121
+ "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (1.0.5)\n",
122
+ "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (1.0.10)\n",
123
+ "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (2.0.8)\n",
124
+ "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (3.0.9)\n",
125
+ "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (8.2.3)\n",
126
+ "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (1.1.2)\n",
127
+ "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (2.4.8)\n",
128
+ "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (2.0.10)\n",
129
+ "Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (0.3.4)\n",
130
+ "Requirement already satisfied: typer<0.10.0,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (0.9.4)\n",
131
+ "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (6.4.0)\n",
132
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (3.1.3)\n",
133
+ "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (69.5.0)\n",
134
+ "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.10/dist-packages (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8)) (3.3.0)\n",
135
+ "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain_community->-r requirements.txt (line 1)) (3.0.3)\n",
136
+ "INFO: pip is looking at multiple versions of together to determine which version is compatible with other requirements. This could take a while.\n",
137
+ "Collecting together<0.3.0,>=0.2.10 (from langchain-together->-r requirements.txt (line 3))\n",
138
+ " Downloading together-0.2.10-py3-none-any.whl.metadata (26 kB)\n",
139
+ "Collecting spacy<4.0.0,>=3.5.4 (from doctran->-r requirements.txt (line 8))\n",
140
+ " Downloading spacy-3.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)\n",
141
+ " Downloading spacy-3.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)\n",
142
+ " Downloading spacy-3.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)\n",
143
+ "Collecting pathy>=0.10.0 (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8))\n",
144
+ " Downloading pathy-0.11.0-py3-none-any.whl.metadata (16 kB)\n",
145
+ "INFO: pip is still looking at multiple versions of together to determine which version is compatible with other requirements. This could take a while.\n",
146
+ "Collecting spacy<4.0.0,>=3.5.4 (from doctran->-r requirements.txt (line 8))\n",
147
+ " Downloading spacy-3.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)\n",
148
+ " Downloading spacy-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)\n",
149
+ "Collecting thinc<8.2.0,>=8.1.8 (from spacy<4.0.0,>=3.5.4->doctran->-r requirements.txt (line 8))\n",
150
+ " Downloading thinc-8.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)\n",
151
+ "Collecting spacy<4.0.0,>=3.5.4 (from doctran->-r requirements.txt (line 8))\n",
152
+ " Downloading spacy-3.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)\n",
153
+ "INFO: This is taking longer than usual. You might need to provide the dependency resolver with stricter constraints to reduce runtime. See https://pip.pypa.io/warnings/backtracking for guidance. If you want to abort this run, press Ctrl + C.\n",
154
+ " Downloading spacy-3.5.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)\n",
155
+ "Collecting presidio-analyzer<3.0.0,>=2.2.33 (from doctran->-r requirements.txt (line 8))\n",
156
+ " Downloading presidio_analyzer-2.2.354-py3-none-any.whl.metadata (2.6 kB)\n",
157
+ "Collecting langsmith<0.2.0,>=0.1.0 (from langchain_community->-r requirements.txt (line 1))\n",
158
+ " Downloading langsmith-0.1.47-py3-none-any.whl.metadata (13 kB)\n",
159
+ " Downloading langsmith-0.1.46-py3-none-any.whl.metadata (13 kB)\n",
160
+ " Downloading langsmith-0.1.45-py3-none-any.whl.metadata (13 kB)\n",
161
+ " Downloading langsmith-0.1.44-py3-none-any.whl.metadata (13 kB)\n",
162
+ " Downloading langsmith-0.1.43-py3-none-any.whl.metadata (13 kB)\n",
163
+ " Downloading langsmith-0.1.42-py3-none-any.whl.metadata (13 kB)\n",
164
+ " Downloading langsmith-0.1.41-py3-none-any.whl.metadata (13 kB)\n",
165
+ " Downloading langsmith-0.1.40-py3-none-any.whl.metadata (13 kB)\n",
166
+ " Downloading langsmith-0.1.39-py3-none-any.whl.metadata (13 kB)\n",
167
+ " Downloading langsmith-0.1.38-py3-none-any.whl.metadata (13 kB)\n",
168
+ " Downloading langsmith-0.1.37-py3-none-any.whl.metadata (13 kB)\n",
169
+ " Downloading langsmith-0.1.36-py3-none-any.whl.metadata (13 kB)\n",
170
+ " Downloading langsmith-0.1.35-py3-none-any.whl.metadata (13 kB)\n",
171
+ " Downloading langsmith-0.1.34-py3-none-any.whl.metadata (13 kB)\n",
172
+ " Downloading langsmith-0.1.33-py3-none-any.whl.metadata (13 kB)\n",
173
+ " Downloading langsmith-0.1.31-py3-none-any.whl.metadata (13 kB)\n",
174
+ " Downloading langsmith-0.1.30-py3-none-any.whl.metadata (13 kB)\n",
175
+ " Downloading langsmith-0.1.29-py3-none-any.whl.metadata (13 kB)\n",
176
+ " Downloading langsmith-0.1.28-py3-none-any.whl.metadata (13 kB)\n",
177
+ " Downloading langsmith-0.1.27-py3-none-any.whl.metadata (13 kB)\n",
178
+ " Downloading langsmith-0.1.26-py3-none-any.whl.metadata (13 kB)\n",
179
+ " Downloading langsmith-0.1.25-py3-none-any.whl.metadata (13 kB)\n",
180
+ " Downloading langsmith-0.1.24-py3-none-any.whl.metadata (13 kB)\n",
181
+ " Downloading langsmith-0.1.23-py3-none-any.whl.metadata (13 kB)\n",
182
+ " Downloading langsmith-0.1.22-py3-none-any.whl.metadata (13 kB)\n",
183
+ " Downloading langsmith-0.1.21-py3-none-any.whl.metadata (13 kB)\n",
184
+ " Downloading langsmith-0.1.20-py3-none-any.whl.metadata (13 kB)\n",
185
+ " Downloading langsmith-0.1.19-py3-none-any.whl.metadata (13 kB)\n",
186
+ " Downloading langsmith-0.1.18-py3-none-any.whl.metadata (13 kB)\n",
187
+ " Downloading langsmith-0.1.17-py3-none-any.whl.metadata (13 kB)\n",
188
+ " Downloading langsmith-0.1.16-py3-none-any.whl.metadata (13 kB)\n",
189
+ " Downloading langsmith-0.1.15-py3-none-any.whl.metadata (13 kB)\n",
190
+ " Downloading langsmith-0.1.14-py3-none-any.whl.metadata (13 kB)\n",
191
+ " Downloading langsmith-0.1.13-py3-none-any.whl.metadata (13 kB)\n",
192
+ " Downloading langsmith-0.1.12-py3-none-any.whl.metadata (13 kB)\n",
193
+ " Downloading langsmith-0.1.11-py3-none-any.whl.metadata (13 kB)\n",
194
+ " Downloading langsmith-0.1.10-py3-none-any.whl.metadata (13 kB)\n",
195
+ " Downloading langsmith-0.1.9-py3-none-any.whl.metadata (13 kB)\n",
196
+ " Downloading langsmith-0.1.8-py3-none-any.whl.metadata (13 kB)\n",
197
+ " Downloading langsmith-0.1.7-py3-none-any.whl.metadata (13 kB)\n",
198
+ " Downloading langsmith-0.1.6-py3-none-any.whl.metadata (13 kB)\n",
199
+ " Downloading langsmith-0.1.5-py3-none-any.whl.metadata (13 kB)\n",
200
+ " Downloading langsmith-0.1.4-py3-none-any.whl.metadata (13 kB)\n",
201
+ " Downloading langsmith-0.1.3-py3-none-any.whl.metadata (13 kB)\n",
202
+ " Downloading langsmith-0.1.2-py3-none-any.whl.metadata (13 kB)\n",
203
+ " Downloading langsmith-0.1.1-py3-none-any.whl.metadata (13 kB)\n",
204
+ " Downloading langsmith-0.1.0-py3-none-any.whl.metadata (13 kB)\n",
205
+ "Collecting langchain-core<0.2.0,>=0.1.41 (from langchain_community->-r requirements.txt (line 1))\n",
206
+ " Downloading langchain_core-0.1.42-py3-none-any.whl.metadata (5.9 kB)\n",
207
+ " Downloading langchain_core-0.1.41-py3-none-any.whl.metadata (5.9 kB)\n",
208
+ "Collecting doctran (from -r requirements.txt (line 8))\n",
209
+ " Downloading doctran-0.0.14-py3-none-any.whl.metadata (8.6 kB)\n",
210
+ " Downloading doctran-0.0.13-py3-none-any.whl.metadata (8.6 kB)\n",
211
+ " Downloading doctran-0.0.12-py3-none-any.whl.metadata (8.6 kB)\n",
212
+ " Downloading doctran-0.0.11-py3-none-any.whl.metadata (8.5 kB)\n",
213
+ " Downloading doctran-0.0.10-py3-none-any.whl.metadata (8.9 kB)\n",
214
+ "Collecting tiktoken<0.5.0,>=0.4.0 (from doctran->-r requirements.txt (line 8))\n",
215
+ " Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)\n",
216
+ "Collecting doctran (from -r requirements.txt (line 8))\n",
217
+ " Downloading doctran-0.0.9-py3-none-any.whl.metadata (8.9 kB)\n",
218
+ "Collecting tiktoken<0.4.0,>=0.3.3 (from doctran->-r requirements.txt (line 8))\n",
219
+ " Downloading tiktoken-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)\n",
220
+ "Collecting doctran (from -r requirements.txt (line 8))\n",
221
+ " Downloading doctran-0.0.8-py3-none-any.whl.metadata (8.9 kB)\n",
222
+ " Downloading doctran-0.0.7-py3-none-any.whl.metadata (8.9 kB)\n",
223
+ " Downloading doctran-0.0.6-py3-none-any.whl.metadata (8.9 kB)\n",
224
+ " Downloading doctran-0.0.5-py3-none-any.whl.metadata (8.9 kB)\n",
225
+ " Downloading doctran-0.0.4-py3-none-any.whl.metadata (8.9 kB)\n",
226
+ " Downloading doctran-0.0.3-py3-none-any.whl.metadata (8.8 kB)\n",
227
+ " Downloading doctran-0.0.2-py3-none-any.whl.metadata (8.9 kB)\n",
228
+ "Collecting bs4<0.0.2,>=0.0.1 (from doctran->-r requirements.txt (line 8))\n",
229
+ " Downloading bs4-0.0.1.tar.gz (1.1 kB)\n",
230
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
231
+ "Requirement already satisfied: jsonschema<5.0.0,>=4.17.3 in /usr/local/lib/python3.10/dist-packages (from doctran->-r requirements.txt (line 8)) (4.19.2)\n",
232
+ "Collecting mailbox<0.5,>=0.4 (from doctran->-r requirements.txt (line 8))\n",
233
+ " Downloading mailbox-0.4.tar.gz (4.1 kB)\n",
234
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
235
+ "Collecting pdfplumber<0.10.0,>=0.9.0 (from doctran->-r requirements.txt (line 8))\n",
236
+ " Downloading pdfplumber-0.9.0-py3-none-any.whl.metadata (35 kB)\n",
237
+ "Collecting doctran (from -r requirements.txt (line 8))\n",
238
+ " Downloading doctran-0.0.1-py3-none-any.whl.metadata (5.5 kB)\n",
239
+ " Downloading doctran-0.0.0-py3-none-any.whl.metadata (599 bytes)\n",
240
+ "Collecting langchain-together (from -r requirements.txt (line 3))\n",
241
+ " Downloading langchain_together-0.1.0-py3-none-any.whl.metadata (1.9 kB)\n",
242
+ " Downloading langchain_together-0.0.2.post2-py3-none-any.whl.metadata (1.9 kB)\n",
243
+ " Downloading langchain_together-0.0.2.post1-py3-none-any.whl.metadata (806 bytes)\n",
244
+ " Downloading langchain_together-0.0.2-py3-none-any.whl.metadata (577 bytes)\n",
245
+ " Downloading langchain_together-0.0.1-py3-none-any.whl.metadata (504 bytes)\n",
246
+ "Collecting langchain-text-splitters (from -r requirements.txt (line 2))\n",
247
+ " Downloading langchain_text_splitters-0.0.1-py3-none-any.whl.metadata (2.0 kB)\n",
248
+ "Collecting langchain_community (from -r requirements.txt (line 1))\n",
249
+ " Downloading langchain_community-0.0.32-py3-none-any.whl.metadata (8.5 kB)\n",
250
+ " Downloading langchain_community-0.0.31-py3-none-any.whl.metadata (8.4 kB)\n",
251
+ "Collecting langchain-core<0.2.0,>=0.1.37 (from langchain_community->-r requirements.txt (line 1))\n",
252
+ " Downloading langchain_core-0.1.40-py3-none-any.whl.metadata (5.9 kB)\n",
253
+ " Downloading langchain_core-0.1.39-py3-none-any.whl.metadata (5.9 kB)\n",
254
+ " Downloading langchain_core-0.1.38-py3-none-any.whl.metadata (6.0 kB)\n",
255
+ " Downloading langchain_core-0.1.37-py3-none-any.whl.metadata (6.0 kB)\n",
256
+ "Collecting langchain_community (from -r requirements.txt (line 1))\n",
257
+ " Downloading langchain_community-0.0.30-py3-none-any.whl.metadata (8.4 kB)\n",
258
+ " Downloading langchain_community-0.0.29-py3-none-any.whl.metadata (8.3 kB)\n",
259
+ "Collecting langchain-core<0.2.0,>=0.1.33 (from langchain_community->-r requirements.txt (line 1))\n",
260
+ " Downloading langchain_core-0.1.36-py3-none-any.whl.metadata (6.0 kB)\n",
261
+ " Downloading langchain_core-0.1.35-py3-none-any.whl.metadata (6.0 kB)\n",
262
+ " Downloading langchain_core-0.1.34-py3-none-any.whl.metadata (6.0 kB)\n",
263
+ " Downloading langchain_core-0.1.33-py3-none-any.whl.metadata (6.0 kB)\n",
264
+ "Requirement already satisfied: anyio<5,>=3 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.2.0,>=0.1.33->langchain_community->-r requirements.txt (line 1)) (3.7.1)\n",
265
+ "Collecting langchain_community (from -r requirements.txt (line 1))\n",
266
+ " Downloading langchain_community-0.0.28-py3-none-any.whl.metadata (8.3 kB)\n",
267
+ "Collecting langchain-core<0.2.0,>=0.1.31 (from langchain_community->-r requirements.txt (line 1))\n",
268
+ " Downloading langchain_core-0.1.32-py3-none-any.whl.metadata (6.0 kB)\n",
269
+ " Downloading langchain_core-0.1.31-py3-none-any.whl.metadata (6.0 kB)\n",
270
+ "Collecting langchain_community (from -r requirements.txt (line 1))\n",
271
+ " Downloading langchain_community-0.0.27-py3-none-any.whl.metadata (8.2 kB)\n",
272
+ "Collecting langchain-core<0.2.0,>=0.1.30 (from langchain_community->-r requirements.txt (line 1))\n",
273
+ " Downloading langchain_core-0.1.30-py3-none-any.whl.metadata (6.0 kB)\n",
274
+ "Collecting langchain_community (from -r requirements.txt (line 1))\n",
275
+ " Downloading langchain_community-0.0.26-py3-none-any.whl.metadata (8.2 kB)\n",
276
+ "Collecting langchain-core<0.2.0,>=0.1.29 (from langchain_community->-r requirements.txt (line 1))\n",
277
+ " Downloading langchain_core-0.1.29-py3-none-any.whl.metadata (6.0 kB)\n",
278
+ "Collecting langchain_community (from -r requirements.txt (line 1))\n",
279
+ " Downloading langchain_community-0.0.25-py3-none-any.whl.metadata (8.1 kB)\n",
280
+ "\u001b[31mERROR: Exception:\n",
281
+ "Traceback (most recent call last):\n",
282
+ " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py\", line 180, in exc_logging_wrapper\n",
283
+ " status = run_func(*args)\n",
284
+ " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py\", line 245, in wrapper\n",
285
+ " return func(self, options, args)\n",
286
+ " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py\", line 377, in run\n",
287
+ " requirement_set = resolver.resolve(\n",
288
+ " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/resolution/resolvelib/resolver.py\", line 95, in resolve\n",
289
+ " result = self._result = resolver.resolve(\n",
290
+ " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/resolvelib/resolvers.py\", line 546, in resolve\n",
291
+ " state = resolution.resolve(requirements, max_rounds=max_rounds)\n",
292
+ " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/resolvelib/resolvers.py\", line 457, in resolve\n",
293
+ " raise ResolutionTooDeep(max_rounds)\n",
294
+ "pip._vendor.resolvelib.resolvers.ResolutionTooDeep: 200000\u001b[0m\u001b[31m\n",
295
+ "\u001b[0m"
296
+ ]
297
+ }
298
+ ]
299
+ },
300
+ {
301
+ "cell_type": "code",
302
+ "source": [
303
+ "!pip install langchain_community langchain-text-splitters unstructured[local-inference] pdf2image pdfminer.six langchain-together pillow_heif"
304
+ ],
305
+ "metadata": {
306
+ "colab": {
307
+ "base_uri": "https://localhost:8080/",
308
+ "height": 1000
309
+ },
310
+ "id": "j06J9xE60u0C",
311
+ "outputId": "06248856-b7d5-402f-f38d-03e475f2786b"
312
+ },
313
+ "execution_count": 1,
314
+ "outputs": [
315
+ {
316
+ "output_type": "stream",
317
+ "name": "stdout",
318
+ "text": [
319
+ "Collecting langchain_community\n",
320
+ " Downloading langchain_community-0.0.32-py3-none-any.whl (1.9 MB)\n",
321
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.9/1.9 MB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
322
+ "\u001b[?25hCollecting langchain-text-splitters\n",
323
+ " Downloading langchain_text_splitters-0.0.1-py3-none-any.whl (21 kB)\n",
324
+ "Collecting unstructured[local-inference]\n",
325
+ " Downloading unstructured-0.13.2-py3-none-any.whl (1.9 MB)\n",
326
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.9/1.9 MB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
327
+ "\u001b[?25hCollecting pdf2image\n",
328
+ " Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)\n",
329
+ "Collecting pdfminer.six\n",
330
+ " Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)\n",
331
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m21.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
332
+ "\u001b[?25hCollecting langchain-together\n",
333
+ " Downloading langchain_together-0.1.0-py3-none-any.whl (6.7 kB)\n",
334
+ "Collecting pillow_heif\n",
335
+ " Downloading pillow_heif-0.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.5 MB)\n",
336
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.5/7.5 MB\u001b[0m \u001b[31m15.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
337
+ "\u001b[?25hRequirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (6.0.1)\n",
338
+ "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (2.0.29)\n",
339
+ "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (3.9.3)\n",
340
+ "Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)\n",
341
+ " Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)\n",
342
+ "Collecting langchain-core<0.2.0,>=0.1.41 (from langchain_community)\n",
343
+ " Downloading langchain_core-0.1.42-py3-none-any.whl (287 kB)\n",
344
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m287.5/287.5 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
345
+ "\u001b[?25hCollecting langsmith<0.2.0,>=0.1.0 (from langchain_community)\n",
346
+ " Downloading langsmith-0.1.47-py3-none-any.whl (113 kB)\n",
347
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m113.0/113.0 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
348
+ "\u001b[?25hRequirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (1.25.2)\n",
349
+ "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (2.31.0)\n",
350
+ "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain_community) (8.2.3)\n",
351
+ "Requirement already satisfied: chardet in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (5.2.0)\n",
352
+ "Collecting filetype (from unstructured[local-inference])\n",
353
+ " Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)\n",
354
+ "Collecting python-magic (from unstructured[local-inference])\n",
355
+ " Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)\n",
356
+ "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (4.9.4)\n",
357
+ "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (3.8.1)\n",
358
+ "Requirement already satisfied: tabulate in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (0.9.0)\n",
359
+ "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (4.12.3)\n",
360
+ "Collecting emoji (from unstructured[local-inference])\n",
361
+ " Downloading emoji-2.11.0-py2.py3-none-any.whl (433 kB)\n",
362
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m433.8/433.8 kB\u001b[0m \u001b[31m29.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
363
+ "\u001b[?25hCollecting python-iso639 (from unstructured[local-inference])\n",
364
+ " Downloading python_iso639-2024.2.7-py3-none-any.whl (274 kB)\n",
365
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m274.7/274.7 kB\u001b[0m \u001b[31m16.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
366
+ "\u001b[?25hCollecting langdetect (from unstructured[local-inference])\n",
367
+ " Downloading langdetect-1.0.9.tar.gz (981 kB)\n",
368
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m981.5/981.5 kB\u001b[0m \u001b[31m38.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
369
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
370
+ "Collecting rapidfuzz (from unstructured[local-inference])\n",
371
+ " Downloading rapidfuzz-3.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)\n",
372
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m31.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
373
+ "\u001b[?25hCollecting backoff (from unstructured[local-inference])\n",
374
+ " Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\n",
375
+ "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (4.11.0)\n",
376
+ "Collecting unstructured-client<=0.18.0 (from unstructured[local-inference])\n",
377
+ " Downloading unstructured_client-0.18.0-py3-none-any.whl (21 kB)\n",
378
+ "Requirement already satisfied: wrapt in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (1.14.1)\n",
379
+ "Collecting unstructured.pytesseract>=0.3.12 (from unstructured[local-inference])\n",
380
+ " Downloading unstructured.pytesseract-0.3.12-py3-none-any.whl (14 kB)\n",
381
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (3.3)\n",
382
+ "Collecting pypdf (from unstructured[local-inference])\n",
383
+ " Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)\n",
384
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m290.4/290.4 kB\u001b[0m \u001b[31m20.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
385
+ "\u001b[?25hCollecting python-pptx<=0.6.23 (from unstructured[local-inference])\n",
386
+ " Downloading python_pptx-0.6.23-py3-none-any.whl (471 kB)\n",
387
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m471.6/471.6 kB\u001b[0m \u001b[31m28.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
388
+ "\u001b[?25hCollecting pypandoc (from unstructured[local-inference])\n",
389
+ " Downloading pypandoc-1.13-py3-none-any.whl (21 kB)\n",
390
+ "Requirement already satisfied: markdown in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (3.6)\n",
391
+ "Requirement already satisfied: openpyxl in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (3.1.2)\n",
392
+ "Requirement already satisfied: xlrd in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (2.0.1)\n",
393
+ "Collecting python-docx (from unstructured[local-inference])\n",
394
+ " Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)\n",
395
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m239.6/239.6 kB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
396
+ "\u001b[?25hCollecting pikepdf (from unstructured[local-inference])\n",
397
+ " Downloading pikepdf-8.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)\n",
398
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
399
+ "\u001b[?25hCollecting unstructured-inference==0.7.25 (from unstructured[local-inference])\n",
400
+ " Downloading unstructured_inference-0.7.25-py3-none-any.whl (58 kB)\n",
401
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.9/58.9 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
402
+ "\u001b[?25hCollecting onnx (from unstructured[local-inference])\n",
403
+ " Downloading onnx-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.9 MB)\n",
404
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.9/15.9 MB\u001b[0m \u001b[31m43.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
405
+ "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from unstructured[local-inference]) (2.0.3)\n",
406
+ "Collecting msg-parser (from unstructured[local-inference])\n",
407
+ " Downloading msg_parser-1.2.0-py2.py3-none-any.whl (101 kB)\n",
408
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━���━\u001b[0m \u001b[32m101.8/101.8 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
409
+ "\u001b[?25hCollecting layoutparser[layoutmodels,tesseract] (from unstructured-inference==0.7.25->unstructured[local-inference])\n",
410
+ " Downloading layoutparser-0.3.4-py3-none-any.whl (19.2 MB)\n",
411
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m19.2/19.2 MB\u001b[0m \u001b[31m39.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
412
+ "\u001b[?25hCollecting python-multipart (from unstructured-inference==0.7.25->unstructured[local-inference])\n",
413
+ " Downloading python_multipart-0.0.9-py3-none-any.whl (22 kB)\n",
414
+ "Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]) (0.20.3)\n",
415
+ "Requirement already satisfied: opencv-python!=4.7.0.68 in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]) (4.8.0.76)\n",
416
+ "Collecting onnxruntime<1.16 (from unstructured-inference==0.7.25->unstructured[local-inference])\n",
417
+ " Downloading onnxruntime-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)\n",
418
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m58.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
419
+ "\u001b[?25hRequirement already satisfied: transformers>=4.25.1 in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.25->unstructured[local-inference]) (4.38.2)\n",
420
+ "Requirement already satisfied: pillow in /usr/local/lib/python3.10/dist-packages (from pdf2image) (9.4.0)\n",
421
+ "Requirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from pdfminer.six) (3.3.2)\n",
422
+ "Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.10/dist-packages (from pdfminer.six) (42.0.5)\n",
423
+ "Collecting together<0.3.0,>=0.2.10 (from langchain-together)\n",
424
+ " Downloading together-0.2.11-py3-none-any.whl (43 kB)\n",
425
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.8/43.8 kB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
426
+ "\u001b[?25hCollecting pillow (from pdf2image)\n",
427
+ " Downloading pillow-10.3.0-cp310-cp310-manylinux_2_28_x86_64.whl (4.5 MB)\n",
428
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m38.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
429
+ "\u001b[?25hRequirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.3.1)\n",
430
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (23.2.0)\n",
431
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.4.1)\n",
432
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (6.0.5)\n",
433
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (1.9.4)\n",
434
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain_community) (4.0.3)\n",
435
+ "Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-packages (from cryptography>=36.0.0->pdfminer.six) (1.16.0)\n",
436
+ "Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)\n",
437
+ " Downloading marshmallow-3.21.1-py3-none-any.whl (49 kB)\n",
438
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
439
+ "\u001b[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)\n",
440
+ " Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n",
441
+ "Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.2.0,>=0.1.41->langchain_community)\n",
442
+ " Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)\n",
443
+ "Collecting packaging<24.0,>=23.2 (from langchain-core<0.2.0,>=0.1.41->langchain_community)\n",
444
+ " Downloading packaging-23.2-py3-none-any.whl (53 kB)\n",
445
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.0/53.0 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
446
+ "\u001b[?25hRequirement already satisfied: pydantic<3,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain-core<0.2.0,>=0.1.41->langchain_community) (2.6.4)\n",
447
+ "Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.0->langchain_community)\n",
448
+ " Downloading orjson-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n",
449
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m144.8/144.8 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
450
+ "\u001b[?25hCollecting XlsxWriter>=0.5.7 (from python-pptx<=0.6.23->unstructured[local-inference])\n",
451
+ " Downloading XlsxWriter-3.2.0-py3-none-any.whl (159 kB)\n",
452
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m159.9/159.9 kB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
453
+ "\u001b[?25hRequirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community) (3.6)\n",
454
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community) (2.0.7)\n",
455
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain_community) (2024.2.2)\n",
456
+ "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain_community) (3.0.3)\n",
457
+ "Collecting sseclient-py<2.0.0,>=1.7.2 (from together<0.3.0,>=0.2.10->langchain-together)\n",
458
+ " Downloading sseclient_py-1.8.0-py2.py3-none-any.whl (8.8 kB)\n",
459
+ "Requirement already satisfied: tqdm<5.0.0,>=4.66.1 in /usr/local/lib/python3.10/dist-packages (from together<0.3.0,>=0.2.10->langchain-together) (4.66.2)\n",
460
+ "Requirement already satisfied: typer<0.10.0,>=0.9.0 in /usr/local/lib/python3.10/dist-packages (from together<0.3.0,>=0.2.10->langchain-together) (0.9.4)\n",
461
+ "Collecting dataclasses-json-speakeasy>=0.5.11 (from unstructured-client<=0.18.0->unstructured[local-inference])\n",
462
+ " Downloading dataclasses_json_speakeasy-0.5.11-py3-none-any.whl (28 kB)\n",
463
+ "Collecting jsonpath-python>=1.0.6 (from unstructured-client<=0.18.0->unstructured[local-inference])\n",
464
+ " Downloading jsonpath_python-1.0.6-py3-none-any.whl (7.6 kB)\n",
465
+ "Collecting mypy-extensions>=1.0.0 (from unstructured-client<=0.18.0->unstructured[local-inference])\n",
466
+ " Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n",
467
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from unstructured-client<=0.18.0->unstructured[local-inference]) (2.8.2)\n",
468
+ "Requirement already satisfied: six>=1.16.0 in /usr/local/lib/python3.10/dist-packages (from unstructured-client<=0.18.0->unstructured[local-inference]) (1.16.0)\n",
469
+ "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->unstructured[local-inference]) (2.5)\n",
470
+ "Collecting olefile>=0.46 (from msg-parser->unstructured[local-inference])\n",
471
+ " Downloading olefile-0.47-py2.py3-none-any.whl (114 kB)\n",
472
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m114.6/114.6 kB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
473
+ "\u001b[?25hRequirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->unstructured[local-inference]) (8.1.7)\n",
474
+ "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->unstructured[local-inference]) (1.4.0)\n",
475
+ "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk->unstructured[local-inference]) (2023.12.25)\n",
476
+ "Requirement already satisfied: protobuf>=3.20.2 in /usr/local/lib/python3.10/dist-packages (from onnx->unstructured[local-inference]) (3.20.3)\n",
477
+ "Requirement already satisfied: et-xmlfile in /usr/local/lib/python3.10/dist-packages (from openpyxl->unstructured[local-inference]) (1.1.0)\n",
478
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->unstructured[local-inference]) (2023.4)\n",
479
+ "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->unstructured[local-inference]) (2024.1)\n",
480
+ "Collecting Deprecated (from pikepdf->unstructured[local-inference])\n",
481
+ " Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)\n",
482
+ "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six) (2.22)\n",
483
+ "Collecting jsonpointer>=1.9 (from jsonpatch<2.0,>=1.33->langchain-core<0.2.0,>=0.1.41->langchain_community)\n",
484
+ " Downloading jsonpointer-2.4-py2.py3-none-any.whl (7.8 kB)\n",
485
+ "Collecting coloredlogs (from onnxruntime<1.16->unstructured-inference==0.7.25->unstructured[local-inference])\n",
486
+ " Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n",
487
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
488
+ "\u001b[?25hRequirement already satisfied: flatbuffers in /usr/local/lib/python3.10/dist-packages (from onnxruntime<1.16->unstructured-inference==0.7.25->unstructured[local-inference]) (24.3.25)\n",
489
+ "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from onnxruntime<1.16->unstructured-inference==0.7.25->unstructured[local-inference]) (1.12)\n",
490
+ "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain-core<0.2.0,>=0.1.41->langchain_community) (0.6.0)\n",
491
+ "Requirement already satisfied: pydantic-core==2.16.3 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->langchain-core<0.2.0,>=0.1.41->langchain_community) (2.16.3)\n",
492
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers>=4.25.1->unstructured-inference==0.7.25->unstructured[local-inference]) (3.13.4)\n",
493
+ "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.25.1->unstructured-inference==0.7.25->unstructured[local-inference]) (0.15.2)\n",
494
+ "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.25.1->unstructured-inference==0.7.25->unstructured[local-inference]) (0.4.2)\n",
495
+ "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub->unstructured-inference==0.7.25->unstructured[local-inference]) (2023.6.0)\n",
496
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (1.11.4)\n",
497
+ "Collecting iopath (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
498
+ " Downloading iopath-0.1.10.tar.gz (42 kB)\n",
499
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.2/42.2 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
500
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
501
+ "Collecting pdfplumber (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
502
+ " Downloading pdfplumber-0.11.0-py3-none-any.whl (56 kB)\n",
503
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.4/56.4 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
504
+ "\u001b[?25hCollecting pytesseract (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
505
+ " Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)\n",
506
+ "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (2.2.1+cu121)\n",
507
+ "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (0.17.1+cu121)\n",
508
+ "Collecting effdet (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
509
+ " Downloading effdet-0.4.1-py3-none-any.whl (112 kB)\n",
510
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.5/112.5 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
511
+ "\u001b[?25hCollecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<1.16->unstructured-inference==0.7.25->unstructured[local-inference])\n",
512
+ " Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n",
513
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
514
+ "\u001b[?25hCollecting timm>=0.9.2 (from effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
515
+ " Downloading timm-0.9.16-py3-none-any.whl (2.2 MB)\n",
516
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m52.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
517
+ "\u001b[?25hRequirement already satisfied: pycocotools>=2.0.2 in /usr/local/lib/python3.10/dist-packages (from effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (2.0.7)\n",
518
+ "Collecting omegaconf>=2.0 (from effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
519
+ " Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)\n",
520
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.5/79.5 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
521
+ "\u001b[?25hRequirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (3.1.3)\n",
522
+ "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
523
+ " Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n",
524
+ "Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
525
+ " Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n",
526
+ "Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
527
+ " Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n",
528
+ "Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
529
+ " Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n",
530
+ "Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
531
+ " Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n",
532
+ "Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
533
+ " Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n",
534
+ "Collecting nvidia-curand-cu12==10.3.2.106 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
535
+ " Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n",
536
+ "Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
537
+ " Using cached nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n",
538
+ "Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
539
+ " Using cached nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n",
540
+ "Collecting nvidia-nccl-cu12==2.19.3 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
541
+ " Using cached nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)\n",
542
+ "Collecting nvidia-nvtx-cu12==12.1.105 (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
543
+ " Using cached nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n",
544
+ "Requirement already satisfied: triton==2.2.0 in /usr/local/lib/python3.10/dist-packages (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (2.2.0)\n",
545
+ "Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
546
+ " Using cached nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n",
547
+ "Collecting portalocker (from iopath->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
548
+ " Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)\n",
549
+ "Collecting pypdfium2>=4.18.0 (from pdfplumber->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
550
+ " Downloading pypdfium2-4.29.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)\n",
551
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m76.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
552
+ "\u001b[?25hRequirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->onnxruntime<1.16->unstructured-inference==0.7.25->unstructured[local-inference]) (1.3.0)\n",
553
+ "Collecting antlr4-python3-runtime==4.9.* (from omegaconf>=2.0->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference])\n",
554
+ " Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)\n",
555
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m117.0/117.0 kB\u001b[0m \u001b[31m12.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
556
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
557
+ "Requirement already satisfied: matplotlib>=2.1.0 in /usr/local/lib/python3.10/dist-packages (from pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (3.7.1)\n",
558
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (2.1.5)\n",
559
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.1.0->pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (1.2.1)\n",
560
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.1.0->pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (0.12.1)\n",
561
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.1.0->pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (4.51.0)\n",
562
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.1.0->pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (1.4.5)\n",
563
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.1.0->pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.25->unstructured[local-inference]) (3.1.2)\n",
564
+ "Building wheels for collected packages: langdetect, iopath, antlr4-python3-runtime\n",
565
+ " Building wheel for langdetect (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
566
+ " Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=e39cbb9b4aa1aad74d62b4cff3f3c84256c3b6b555b3762b863406e1ea056f1e\n",
567
+ " Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106\n",
568
+ " Building wheel for iopath (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
569
+ " Created wheel for iopath: filename=iopath-0.1.10-py3-none-any.whl size=31532 sha256=b2aa25855332a5c43eef0c3000cb3b16d8a4c9f2578bc12e110a880bc2523f92\n",
570
+ " Stored in directory: /root/.cache/pip/wheels/9a/a3/b6/ac0fcd1b4ed5cfeb3db92e6a0e476cfd48ed0df92b91080c1d\n",
571
+ " Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
572
+ " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144554 sha256=376b87330fe481da2030f2484d7d0ac28fbeb173911aef693e27e4bfb094c6a0\n",
573
+ " Stored in directory: /root/.cache/pip/wheels/12/93/dd/1f6a127edc45659556564c5730f6d4e300888f4bca2d4c5a88\n",
574
+ "Successfully built langdetect iopath antlr4-python3-runtime\n",
575
+ "Installing collected packages: sseclient-py, filetype, antlr4-python3-runtime, XlsxWriter, rapidfuzz, python-multipart, python-magic, python-iso639, python-docx, pypdfium2, pypdf, pypandoc, portalocker, pillow, packaging, orjson, onnx, omegaconf, olefile, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, mypy-extensions, langdetect, jsonpointer, jsonpath-python, humanfriendly, emoji, Deprecated, backoff, unstructured.pytesseract, typing-inspect, python-pptx, pytesseract, pillow_heif, pikepdf, pdf2image, nvidia-cusparse-cu12, nvidia-cudnn-cu12, msg-parser, marshmallow, jsonpatch, iopath, coloredlogs, together, pdfminer.six, onnxruntime, nvidia-cusolver-cu12, langsmith, dataclasses-json-speakeasy, dataclasses-json, unstructured-client, pdfplumber, langchain-core, unstructured, layoutparser, langchain-together, langchain-text-splitters, langchain_community, timm, effdet, unstructured-inference\n",
576
+ " Attempting uninstall: pillow\n",
577
+ " Found existing installation: Pillow 9.4.0\n",
578
+ " Uninstalling Pillow-9.4.0:\n",
579
+ " Successfully uninstalled Pillow-9.4.0\n",
580
+ " Attempting uninstall: packaging\n",
581
+ " Found existing installation: packaging 24.0\n",
582
+ " Uninstalling packaging-24.0:\n",
583
+ " Successfully uninstalled packaging-24.0\n",
584
+ "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
585
+ "imageio 2.31.6 requires pillow<10.1.0,>=8.3.2, but you have pillow 10.3.0 which is incompatible.\u001b[0m\u001b[31m\n",
586
+ "\u001b[0mSuccessfully installed Deprecated-1.2.14 XlsxWriter-3.2.0 antlr4-python3-runtime-4.9.3 backoff-2.2.1 coloredlogs-15.0.1 dataclasses-json-0.6.4 dataclasses-json-speakeasy-0.5.11 effdet-0.4.1 emoji-2.11.0 filetype-1.2.0 humanfriendly-10.0 iopath-0.1.10 jsonpatch-1.33 jsonpath-python-1.0.6 jsonpointer-2.4 langchain-core-0.1.42 langchain-text-splitters-0.0.1 langchain-together-0.1.0 langchain_community-0.0.32 langdetect-1.0.9 langsmith-0.1.47 layoutparser-0.3.4 marshmallow-3.21.1 msg-parser-1.2.0 mypy-extensions-1.0.0 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.127 nvidia-nvtx-cu12-12.1.105 olefile-0.47 omegaconf-2.3.0 onnx-1.16.0 onnxruntime-1.15.1 orjson-3.10.0 packaging-23.2 pdf2image-1.17.0 pdfminer.six-20231228 pdfplumber-0.11.0 pikepdf-8.15.0 pillow-10.3.0 pillow_heif-0.16.0 portalocker-2.8.2 pypandoc-1.13 pypdf-4.2.0 pypdfium2-4.29.0 pytesseract-0.3.10 python-docx-1.1.0 python-iso639-2024.2.7 python-magic-0.4.27 python-multipart-0.0.9 python-pptx-0.6.23 rapidfuzz-3.8.1 sseclient-py-1.8.0 timm-0.9.16 together-0.2.11 typing-inspect-0.9.0 unstructured-0.13.2 unstructured-client-0.18.0 unstructured-inference-0.7.25 unstructured.pytesseract-0.3.12\n"
587
+ ]
588
+ },
589
+ {
590
+ "output_type": "display_data",
591
+ "data": {
592
+ "application/vnd.colab-display-data+json": {
593
+ "pip_warning": {
594
+ "packages": [
595
+ "PIL",
596
+ "pydevd_plugins"
597
+ ]
598
+ },
599
+ "id": "a05b30d70de54e25a3d0c32fffc55ab0"
600
+ }
601
+ },
602
+ "metadata": {}
603
+ }
604
+ ]
605
+ },
606
+ {
607
+ "cell_type": "code",
608
+ "source": [
609
+ "from langchain_community.document_loaders import TextLoader\n",
610
+ "from langchain_community.embeddings.fake import FakeEmbeddings\n",
611
+ "from langchain_community.vectorstores import Vectara\n",
612
+ "from langchain_text_splitters import CharacterTextSplitter"
613
+ ],
614
+ "metadata": {
615
+ "id": "bSRybIQ60tRl"
616
+ },
617
+ "execution_count": 5,
618
+ "outputs": []
619
+ },
620
+ {
621
+ "cell_type": "code",
622
+ "source": [
623
+ "from google.colab import userdata\n",
624
+ "\n",
625
+ "TOGETHER_API_KEY = userdata.get('TOGETHER_API_KEY')\n",
626
+ "vectara_customer_id = userdata.get('VECTARA_CUSTOMER_ID')\n",
627
+ "vectara_corpus_id = userdata.get('VECTARA_CORPUS_ID')\n",
628
+ "vectara_api_key = userdata.get('VECTARA_API_KEY')"
629
+ ],
630
+ "metadata": {
631
+ "id": "d98hRDFC3WyH"
632
+ },
633
+ "execution_count": 6,
634
+ "outputs": []
635
+ },
636
+ {
637
+ "cell_type": "code",
638
+ "source": [
639
+ "vectorstore = Vectara(\n",
640
+ " vectara_customer_id=vectara_customer_id,\n",
641
+ " vectara_corpus_id=vectara_corpus_id,\n",
642
+ " vectara_api_key=vectara_api_key\n",
643
+ " )"
644
+ ],
645
+ "metadata": {
646
+ "id": "n7aGHYcyzgXK"
647
+ },
648
+ "execution_count": 7,
649
+ "outputs": []
650
+ },
651
+ {
652
+ "cell_type": "code",
653
+ "source": [
654
+ "from langchain_community.document_loaders import UnstructuredPDFLoader"
655
+ ],
656
+ "metadata": {
657
+ "id": "aX5VJiU07RZs"
658
+ },
659
+ "execution_count": 8,
660
+ "outputs": []
661
+ },
662
+ {
663
+ "cell_type": "code",
664
+ "source": [
665
+ "!mkdir docs\n",
666
+ "# upload sample file"
667
+ ],
668
+ "metadata": {
669
+ "id": "UQors5XgGPV7"
670
+ },
671
+ "execution_count": 37,
672
+ "outputs": []
673
+ },
674
+ {
675
+ "cell_type": "code",
676
+ "source": [
677
+ "loader = UnstructuredPDFLoader('ISB-020-U3-W-S-01-B18003-001-020.pdf', strategy='fast')\n",
678
+ "data = loader.load()"
679
+ ],
680
+ "metadata": {
681
+ "id": "ULSBXZRcI_4R"
682
+ },
683
+ "execution_count": 9,
684
+ "outputs": []
685
+ },
686
+ {
687
+ "cell_type": "code",
688
+ "source": [
689
+ "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
690
+ "docs = text_splitter.split_documents(data)"
691
+ ],
692
+ "metadata": {
693
+ "colab": {
694
+ "base_uri": "https://localhost:8080/"
695
+ },
696
+ "id": "rd_8GLJrPT5T",
697
+ "outputId": "002488bd-f8a0-4099-c4c4-b685da7a8195"
698
+ },
699
+ "execution_count": 10,
700
+ "outputs": [
701
+ {
702
+ "output_type": "stream",
703
+ "name": "stderr",
704
+ "text": [
705
+ "WARNING:langchain_text_splitters.base:Created a chunk of size 3260, which is longer than the specified 1000\n",
706
+ "WARNING:langchain_text_splitters.base:Created a chunk of size 1754, which is longer than the specified 1000\n",
707
+ "WARNING:langchain_text_splitters.base:Created a chunk of size 1556, which is longer than the specified 1000\n",
708
+ "WARNING:langchain_text_splitters.base:Created a chunk of size 2529, which is longer than the specified 1000\n",
709
+ "WARNING:langchain_text_splitters.base:Created a chunk of size 2108, which is longer than the specified 1000\n",
710
+ "WARNING:langchain_text_splitters.base:Created a chunk of size 1240, which is longer than the specified 1000\n",
711
+ "WARNING:langchain_text_splitters.base:Created a chunk of size 1122, which is longer than the specified 1000\n"
712
+ ]
713
+ }
714
+ ]
715
+ },
716
+ {
717
+ "cell_type": "code",
718
+ "source": [
719
+ "import json\n",
720
+ "\n",
721
+ "from langchain_community.document_transformers import DoctranPropertyExtractor\n",
722
+ "from langchain_core.documents import Document"
723
+ ],
724
+ "metadata": {
725
+ "id": "6CM6bL6JRCCA"
726
+ },
727
+ "execution_count": 3,
728
+ "outputs": []
729
+ },
730
+ {
731
+ "cell_type": "code",
732
+ "source": [
733
+ "properties = [\n",
734
+ " {\n",
735
+ " \"name\": \"document_number\",\n",
736
+ " \"description\": \"Unique identifier for the document within its project.\",\n",
737
+ " \"type\": \"string\",\n",
738
+ " \"required\": True\n",
739
+ " },\n",
740
+ " {\n",
741
+ " \"name\": \"discipline\",\n",
742
+ " \"description\": \"The discipline associated with the document.\",\n",
743
+ " \"type\": \"string\",\n",
744
+ " \"required\": True\n",
745
+ " },\n",
746
+ " {\n",
747
+ " \"name\": \"title\",\n",
748
+ " \"description\": \"Title of the document.\",\n",
749
+ " \"type\": \"string\",\n",
750
+ " \"required\": True\n",
751
+ " },\n",
752
+ " {\n",
753
+ " \"name\": \"version\",\n",
754
+ " \"description\": \"Version number of the document.\",\n",
755
+ " \"type\": \"integer\",\n",
756
+ " \"required\": True\n",
757
+ " },\n",
758
+ " {\n",
759
+ " \"name\": \"date\",\n",
760
+ " \"description\": \"Creation date of the document.\",\n",
761
+ " \"type\": \"string\",\n",
762
+ " \"format\": \"date\",\n",
763
+ " \"required\": True\n",
764
+ " },\n",
765
+ " {\n",
766
+ " \"name\": \"author\",\n",
767
+ " \"description\": \"Author of the document.\",\n",
768
+ " \"type\": \"object\",\n",
769
+ " \"properties\": {\n",
770
+ " \"name\": {\n",
771
+ " \"type\": \"string\",\n",
772
+ " \"required\": True\n",
773
+ " },\n",
774
+ " \"email\": {\n",
775
+ " \"type\": \"string\",\n",
776
+ " \"format\": \"email\",\n",
777
+ " \"required\": False\n",
778
+ " }\n",
779
+ " },\n",
780
+ " \"required\": True\n",
781
+ " },\n",
782
+ " {\n",
783
+ " \"name\": \"related_documents\",\n",
784
+ " \"description\": \"List of related documents.\",\n",
785
+ " \"type\": \"array\",\n",
786
+ " \"items\": {\n",
787
+ " \"type\": \"string\"\n",
788
+ " },\n",
789
+ " \"required\": False\n",
790
+ " },\n",
791
+ " {\n",
792
+ " \"name\": \"status\",\n",
793
+ " \"description\": \"Current status of the document.\",\n",
794
+ " \"type\": \"string\",\n",
795
+ " \"enum\": [\"draft\", \"under_review\", \"approved\", \"rejected\"],\n",
796
+ " \"required\": True\n",
797
+ " },\n",
798
+ " {\n",
799
+ " \"name\": \"keywords\",\n",
800
+ " \"description\": \"Keywords associated with the document.\",\n",
801
+ " \"type\": \"array\",\n",
802
+ " \"items\": {\n",
803
+ " \"type\": \"string\"\n",
804
+ " },\n",
805
+ " \"required\": False\n",
806
+ " },\n",
807
+ " {\n",
808
+ " \"name\": \"summary\",\n",
809
+ " \"description\": \"Short summary of the document content.\",\n",
810
+ " \"type\": \"string\",\n",
811
+ " \"required\": False\n",
812
+ " }\n",
813
+ "]"
814
+ ],
815
+ "metadata": {
816
+ "id": "9rBUSiR-bDAD"
817
+ },
818
+ "execution_count": 12,
819
+ "outputs": []
820
+ },
821
+ {
822
+ "cell_type": "code",
823
+ "source": [
824
+ "property_extractor = DoctranPropertyExtractor(properties=properties)"
825
+ ],
826
+ "metadata": {
827
+ "colab": {
828
+ "base_uri": "https://localhost:8080/",
829
+ "height": 339
830
+ },
831
+ "id": "H5jIV-OYfJRg",
832
+ "outputId": "0eb7dc55-088e-4912-c85e-dc4eb87e442c"
833
+ },
834
+ "execution_count": 13,
835
+ "outputs": [
836
+ {
837
+ "output_type": "error",
838
+ "ename": "ValueError",
839
+ "evalue": "Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter.",
840
+ "traceback": [
841
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
842
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
843
+ "\u001b[0;32m<ipython-input-13-2ab54c930860>\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mproperty_extractor\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDoctranPropertyExtractor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mproperties\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mproperties\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
844
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_community/document_transformers/doctran_text_extract.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, properties, openai_api_key, openai_api_model)\u001b[0m\n\u001b[1;32m 57\u001b[0m ) -> None:\n\u001b[1;32m 58\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mproperties\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mproperties\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 59\u001b[0;31m self.openai_api_key = openai_api_key or get_from_env(\n\u001b[0m\u001b[1;32m 60\u001b[0m \u001b[0;34m\"openai_api_key\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"OPENAI_API_KEY\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m )\n",
845
+ "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_core/utils/env.py\u001b[0m in \u001b[0;36mget_from_env\u001b[0;34m(key, env_key, default)\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdefault\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 41\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 42\u001b[0m \u001b[0;34mf\"Did not find {key}, please add an environment variable\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0;34mf\" `{env_key}` which contains it, or pass\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
846
+ "\u001b[0;31mValueError\u001b[0m: Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter."
847
+ ]
848
+ }
849
+ ]
850
+ },
851
+ {
852
+ "cell_type": "markdown",
853
+ "source": [],
854
+ "metadata": {
855
+ "id": "hVjJAK-KTTEE"
856
+ }
857
+ },
858
+ {
859
+ "cell_type": "code",
860
+ "source": [
861
+ "from dotenv import load_dotenv\n",
862
+ "\n",
863
+ "load_dotenv()"
864
+ ],
865
+ "metadata": {
866
+ "colab": {
867
+ "base_uri": "https://localhost:8080/",
868
+ "height": 349
869
+ },
870
+ "id": "HVDCIqIDRJ3Z",
871
+ "outputId": "4fbd508a-2179-4251-d2ae-2df5cce24187"
872
+ },
873
+ "execution_count": 4,
874
+ "outputs": [
875
+ {
876
+ "output_type": "error",
877
+ "ename": "ModuleNotFoundError",
878
+ "evalue": "No module named 'dotenv'",
879
+ "traceback": [
880
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
881
+ "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
882
+ "\u001b[0;32m<ipython-input-4-c9bdfc1ba4a4>\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mdotenv\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mload_dotenv\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mload_dotenv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
883
+ "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'dotenv'",
884
+ "",
885
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0;32m\nNOTE: If your import is failing due to a missing package, you can\nmanually install dependencies using either !pip or !apt.\n\nTo view examples of installing some common dependencies, click the\n\"Open Examples\" button below.\n\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n"
886
+ ],
887
+ "errorDetails": {
888
+ "actions": [
889
+ {
890
+ "action": "open_url",
891
+ "actionText": "Open Examples",
892
+ "url": "/notebooks/snippets/importing_libraries.ipynb"
893
+ }
894
+ ]
895
+ }
896
+ }
897
+ ]
898
+ }
899
+ ]
900
+ }
langchain_vectara.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """langchain_vectara.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1XzD7XHO_a-gYFBnGUWH1MOtstT4sDY3J
8
+ """
9
+
10
+ !pip install -r requirements.txt
11
+
12
+ !pip install langchain_community langchain-text-splitters unstructured[local-inference] pdf2image pdfminer.six langchain-together pillow_heif
13
+
14
+ from langchain_community.document_loaders import TextLoader
15
+ from langchain_community.embeddings.fake import FakeEmbeddings
16
+ from langchain_community.vectorstores import Vectara
17
+ from langchain_text_splitters import CharacterTextSplitter
18
+
19
+ from google.colab import userdata
20
+
21
+ TOGETHER_API_KEY = userdata.get('TOGETHER_API_KEY')  # read via google.colab userdata; not referenced again in this script
22
+ vectara_customer_id = userdata.get('VECTARA_CUSTOMER_ID')  # the three Vectara values below are passed to Vectara(...)
23
+ vectara_corpus_id = userdata.get('VECTARA_CORPUS_ID')
24
+ vectara_api_key = userdata.get('VECTARA_API_KEY')
25
+
26
+ vectorstore = Vectara(  # Vectara vector store configured with the customer id, corpus id and API key read above
27
+ vectara_customer_id=vectara_customer_id,
28
+ vectara_corpus_id=vectara_corpus_id,
29
+ vectara_api_key=vectara_api_key
30
+ )
31
+
32
+ from langchain_community.document_loaders import UnstructuredPDFLoader
33
+
34
+ !mkdir docs
35
+ # upload sample file
36
+
37
+ loader = UnstructuredPDFLoader('ISB-020-U3-W-S-01-B18003-001-020.pdf', strategy='fast')
38
+ data = loader.load()
39
+
40
+ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
41
+ docs = text_splitter.split_documents(data)
42
+
43
+ import json
44
+
45
+ from langchain_community.document_transformers import DoctranPropertyExtractor
46
+ from langchain_core.documents import Document
47
+
48
+ properties = [  # property schema passed to DoctranPropertyExtractor(properties=...) below; one dict per field to extract
49
+ {
50
+ "name": "document_number",
51
+ "description": "Unique identifier for the document within its project.",
52
+ "type": "string",
53
+ "required": True
54
+ },
55
+ {
56
+ "name": "discipline",
57
+ "description": "The discipline associated with the document.",
58
+ "type": "string",
59
+ "required": True
60
+ },
61
+ {
62
+ "name": "title",
63
+ "description": "Title of the document.",
64
+ "type": "string",
65
+ "required": True
66
+ },
67
+ {
68
+ "name": "version",
69
+ "description": "Version number of the document.",
70
+ "type": "integer",  # NOTE(review): confirm the extractor accepts "integer" as a type; it may only allow "number" - TODO verify against doctran
71
+ "required": True
72
+ },
73
+ {
74
+ "name": "date",
75
+ "description": "Creation date of the document.",
76
+ "type": "string",
77
+ "format": "date",  # NOTE(review): unclear from this file whether a "format" key is honored - confirm
78
+ "required": True
79
+ },
80
+ {
81
+ "name": "author",
82
+ "description": "Author of the document.",
83
+ "type": "object",
84
+ "properties": {  # NOTE(review): nested schema; confirm the extractor reads a nested "properties" key
85
+ "name": {
86
+ "type": "string",
87
+ "required": True
88
+ },
89
+ "email": {
90
+ "type": "string",
91
+ "format": "email",
92
+ "required": False
93
+ }
94
+ },
95
+ "required": True
96
+ },
97
+ {
98
+ "name": "related_documents",
99
+ "description": "List of related documents.",
100
+ "type": "array",
101
+ "items": {
102
+ "type": "string"
103
+ },
104
+ "required": False
105
+ },
106
+ {
107
+ "name": "status",
108
+ "description": "Current status of the document.",
109
+ "type": "string",
110
+ "enum": ["draft", "under_review", "approved", "rejected"],
111
+ "required": True
112
+ },
113
+ {
114
+ "name": "keywords",
115
+ "description": "Keywords associated with the document.",
116
+ "type": "array",
117
+ "items": {
118
+ "type": "string"
119
+ },
120
+ "required": False
121
+ },
122
+ {
123
+ "name": "summary",
124
+ "description": "Short summary of the document content.",
125
+ "type": "string",
126
+ "required": False
127
+ }
128
+ ]
129
+
130
+ property_extractor = DoctranPropertyExtractor(properties=properties)
131
+
132
+ from dotenv import load_dotenv
133
+
134
+ load_dotenv()
requirements.txt CHANGED
@@ -1 +1,9 @@
1
-
 
 
 
 
 
 
 
 
 
1
+ langchain_community
2
+ langchain-text-splitters
3
+ langchain-together
4
+ unstructured[local-inference]
5
+ pdf2image
6
+ pdfminer.six
7
+ pillow_heif
8
+ doctran
9
+ python-dotenv