# -*- coding: utf-8 -*- """langchain_vectara.ipynb Automatically generated by Colab. Original file is located at https://colab.research.google.com/drive/1XzD7XHO_a-gYFBnGUWH1MOtstT4sDY3J """ !pip install -r requirements.txt !pip install langchain_community langchain-text-splitters unstructured[local-inference] pdf2image pdfminer.six langchain-together pillow_heif from langchain_community.document_loaders import TextLoader from langchain_community.embeddings.fake import FakeEmbeddings from langchain_community.vectorstores import Vectara from langchain_text_splitters import CharacterTextSplitter from google.colab import userdata TOGETHER_API_KEY = userdata.get('TOGETHER_API_KEY') vectara_customer_id = userdata.get('VECTARA_CUSTOMER_ID') vectara_corpus_id = userdata.get('VECTARA_CORPUS_ID') vectara_api_key = userdata.get('VECTARA_API_KEY') vectorstore = Vectara( vectara_customer_id=vectara_customer_id, vectara_corpus_id=vectara_corpus_id, vectara_api_key=vectara_api_key ) from langchain_community.document_loaders import UnstructuredPDFLoader !mkdir docs # upload sample file loader = UnstructuredPDFLoader('ISB-020-U3-W-S-01-B18003-001-020.pdf', strategy='fast') data = loader.load() text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) docs = text_splitter.split_documents(data) import json from langchain_community.document_transformers import DoctranPropertyExtractor from langchain_core.documents import Document properties = [ { "name": "document_number", "description": "Unique identifier for the document within its project.", "type": "string", "required": True }, { "name": "discipline", "description": "The discipline associated with the document.", "type": "string", "required": True }, { "name": "title", "description": "Title of the document.", "type": "string", "required": True }, { "name": "version", "description": "Version number of the document.", "type": "integer", "required": True }, { "name": "date", "description": "Creation date of the document.", "type": "string", "format": "date", "required": True }, { "name": "author", "description": "Author of the document.", "type": "object", "properties": { "name": { "type": "string", "required": True }, "email": { "type": "string", "format": "email", "required": False } }, "required": True }, { "name": "related_documents", "description": "List of related documents.", "type": "array", "items": { "type": "string" }, "required": False }, { "name": "status", "description": "Current status of the document.", "type": "string", "enum": ["draft", "under_review", "approved", "rejected"], "required": True }, { "name": "keywords", "description": "Keywords associated with the document.", "type": "array", "items": { "type": "string" }, "required": False }, { "name": "summary", "description": "Short summary of the document content.", "type": "string", "required": False } ] property_extractor = DoctranPropertyExtractor(properties=properties) from dotenv import load_dotenv load_dotenv()