# -*- coding: utf-8 -*-
"""langchain_vectara.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1XzD7XHO_a-gYFBnGUWH1MOtstT4sDY3J
"""
!pip install -r requirements.txt
!pip install langchain_community langchain-text-splitters unstructured[local-inference] pdf2image pdfminer.six langchain-together pillow_heif
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.fake import FakeEmbeddings
from langchain_community.vectorstores import Vectara
from langchain_text_splitters import CharacterTextSplitter
from google.colab import userdata

# Pull the API keys and Vectara corpus credentials from Colab secrets
TOGETHER_API_KEY = userdata.get('TOGETHER_API_KEY')
vectara_customer_id = userdata.get('VECTARA_CUSTOMER_ID')
vectara_corpus_id = userdata.get('VECTARA_CORPUS_ID')
vectara_api_key = userdata.get('VECTARA_API_KEY')
vectorstore = Vectara(
    vectara_customer_id=vectara_customer_id,
    vectara_corpus_id=vectara_corpus_id,
    vectara_api_key=vectara_api_key,
)
from langchain_community.document_loaders import UnstructuredPDFLoader
!mkdir docs

# Upload the sample PDF to the working directory, then parse it with unstructured
loader = UnstructuredPDFLoader('ISB-020-U3-W-S-01-B18003-001-020.pdf', strategy='fast')
data = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)
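# The split chunks can now be indexed into the Vectara corpus configured above and
# queried back. This is a minimal sketch, not part of the original notebook: the query
# string is a hypothetical placeholder, and it assumes the corpus credentials are valid.
vectorstore.add_documents(docs)  # push the chunks into the Vectara corpus
results = vectorstore.similarity_search(
    "What is the document about?",  # illustrative query only
    k=3,
)
for doc in results:
    print(doc.page_content[:200])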
import json
from langchain_community.document_transformers import DoctranPropertyExtractor
from langchain_core.documents import Document
properties = [
    {
        "name": "document_number",
        "description": "Unique identifier for the document within its project.",
        "type": "string",
        "required": True,
    },
    {
        "name": "discipline",
        "description": "The discipline associated with the document.",
        "type": "string",
        "required": True,
    },
    {
        "name": "title",
        "description": "Title of the document.",
        "type": "string",
        "required": True,
    },
    {
        "name": "version",
        "description": "Version number of the document.",
        "type": "integer",
        "required": True,
    },
    {
        "name": "date",
        "description": "Creation date of the document.",
        "type": "string",
        "format": "date",
        "required": True,
    },
    {
        "name": "author",
        "description": "Author of the document.",
        "type": "object",
        "properties": {
            "name": {"type": "string", "required": True},
            "email": {"type": "string", "format": "email", "required": False},
        },
        "required": True,
    },
    {
        "name": "related_documents",
        "description": "List of related documents.",
        "type": "array",
        "items": {"type": "string"},
        "required": False,
    },
    {
        "name": "status",
        "description": "Current status of the document.",
        "type": "string",
        "enum": ["draft", "under_review", "approved", "rejected"],
        "required": True,
    },
    {
        "name": "keywords",
        "description": "Keywords associated with the document.",
        "type": "array",
        "items": {"type": "string"},
        "required": False,
    },
    {
        "name": "summary",
        "description": "Short summary of the document content.",
        "type": "string",
        "required": False,
    },
]
property_extractor = DoctranPropertyExtractor(properties=properties)
from dotenv import load_dotenv
load_dotenv()
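# With OPENAI_API_KEY loaded from the .env file (Doctran calls the OpenAI API under the
# hood), the extractor can annotate each chunk with the properties defined above.
# A minimal sketch, not from the original notebook; in a Colab cell the coroutine can be
# awaited at the top level, and the extracted fields land in the document metadata.
extracted_docs = await property_extractor.atransform_documents(docs)
print(json.dumps(extracted_docs[0].metadata["extracted_properties"], indent=2))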