# -*- coding: utf-8 -*-
"""langchain_vectara.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1XzD7XHO_a-gYFBnGUWH1MOtstT4sDY3J
"""
!pip install -r requirements.txt
!pip install langchain_community langchain-text-splitters unstructured[local-inference] pdf2image pdfminer.six langchain-together pillow_heif
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.fake import FakeEmbeddings
from langchain_community.vectorstores import Vectara
from langchain_text_splitters import CharacterTextSplitter
from google.colab import userdata

# Pull the API keys and Vectara corpus credentials from Colab secrets
TOGETHER_API_KEY = userdata.get('TOGETHER_API_KEY')
vectara_customer_id = userdata.get('VECTARA_CUSTOMER_ID')
vectara_corpus_id = userdata.get('VECTARA_CORPUS_ID')
vectara_api_key = userdata.get('VECTARA_API_KEY')
vectorstore = Vectara(
    vectara_customer_id=vectara_customer_id,
    vectara_corpus_id=vectara_corpus_id,
    vectara_api_key=vectara_api_key,
)
from langchain_community.document_loaders import UnstructuredPDFLoader
!mkdir docs

# Upload the sample PDF to the working directory, then parse it with unstructured
loader = UnstructuredPDFLoader('ISB-020-U3-W-S-01-B18003-001-020.pdf', strategy='fast')
data = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)
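# The split chunks can now be indexed into the Vectara corpus configured above and
# queried back. This is a minimal sketch, not part of the original notebook: the query
# string is a hypothetical placeholder, and it assumes the corpus credentials are valid.
vectorstore.add_documents(docs)  # push the chunks into the Vectara corpus
results = vectorstore.similarity_search(
    "What is the document about?",  # illustrative query only
    k=3,
)
for doc in results:
    print(doc.page_content[:200])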
import json
from langchain_community.document_transformers import DoctranPropertyExtractor
from langchain_core.documents import Document
properties = [
    {
        "name": "document_number",
        "description": "Unique identifier for the document within its project.",
        "type": "string",
        "required": True,
    },
    {
        "name": "discipline",
        "description": "The discipline associated with the document.",
        "type": "string",
        "required": True,
    },
    {
        "name": "title",
        "description": "Title of the document.",
        "type": "string",
        "required": True,
    },
    {
        "name": "version",
        "description": "Version number of the document.",
        "type": "integer",
        "required": True,
    },
    {
        "name": "date",
        "description": "Creation date of the document.",
        "type": "string",
        "format": "date",
        "required": True,
    },
    {
        "name": "author",
        "description": "Author of the document.",
        "type": "object",
        "properties": {
            "name": {"type": "string", "required": True},
            "email": {"type": "string", "format": "email", "required": False},
        },
        "required": True,
    },
    {
        "name": "related_documents",
        "description": "List of related documents.",
        "type": "array",
        "items": {"type": "string"},
        "required": False,
    },
    {
        "name": "status",
        "description": "Current status of the document.",
        "type": "string",
        "enum": ["draft", "under_review", "approved", "rejected"],
        "required": True,
    },
    {
        "name": "keywords",
        "description": "Keywords associated with the document.",
        "type": "array",
        "items": {"type": "string"},
        "required": False,
    },
    {
        "name": "summary",
        "description": "Short summary of the document content.",
        "type": "string",
        "required": False,
    },
]
property_extractor = DoctranPropertyExtractor(properties=properties)
from dotenv import load_dotenv
load_dotenv()
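# With OPENAI_API_KEY loaded from the .env file (Doctran calls the OpenAI API under the
# hood), the extractor can annotate each chunk with the properties defined above.
# A minimal sketch, not from the original notebook; in a Colab cell the coroutine can be
# awaited at the top level, and the extracted fields land in the document metadata.
extracted_docs = await property_extractor.atransform_documents(docs)
print(json.dumps(extracted_docs[0].metadata["extracted_properties"], indent=2))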