# -*- coding: utf-8 -*- """pdf chat.ipynb Automatically generated by Colaboratory. Original file is located at https://colab.research.google.com/drive/1RXTs4FPcFCVb9_ZAWBBxLoYQEcKz37x9 """ !pip install langchain !pip install unstructured # The unstructured library provides open-source components for pre-processing text documents such as PDFs, HTML and Word Documents. !pip install openai !pip install pybind11 # pybind11 is a lightweight header-only library that exposes C++ types in Python !pip install chromadb # the AI-native open-source embedding database !pip install Cython # Cython is an optimising static compiler for both the Python programming language !pip3 install "git+https://github.com/philferriere/cocoapi.git#egg=pycocotools&subdirectory=PythonAPI" # COCO is a large image dataset designed for object detection, segmentation, person keypoints detection, stuff segmentation, and caption generation !pip install unstructured[local-inference] !CC=clang CXX=clang++ ARCHFLAGS="-arch x86_64" pip install 'git+https://github.com/facebookresearch/detectron2.git' # Detectron2 is Facebook AI Research's next generation library that provides state-of-the-art detection and segmentation algorithms. !pip install layoutparser[layoutmodels,tesseract] # A Unified Toolkit for Deep Learning Based Document Image Analysis !pip install pytesseract # Python-tesseract is an optical character recognition (OCR) tool for python. !pip install Pillow==9.0.0 # The Python Imaging Library adds image processing capabilities to your Python interpreter. Need this version, otherwise errors occur. !pip install tiktoken !pip install --upgrade Pillow import os os.environ['OPENAI_API_KEY'] = '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' from langchain.document_loaders import UnstructuredPDFLoader from langchain.indexes import VectorstoreIndexCreator from detectron2.config import get_cfg cfg = get_cfg() cfg.MODEL.DEVICE = 'gpu' #GPU is recommended !wget https://pgcag.files.wordpress.com/2010/01/48lawsofpower.pdf #meta earnings; replace with any pdf !mkdir docs !mv 48lawsofpower.pdf docs text_folder = 'docs' loaders = [UnstructuredPDFLoader(os.path.join(text_folder, fn)) for fn in os.listdir(text_folder)] !apt-get install poppler-utils # error occurs without this, pdf rendering library index = VectorstoreIndexCreator().from_loaders(loaders) query = "Can you give me an example from history where the enemy was crushed totally from the book?" index.query(query) query = "What's the point of making myself less accessible?" index.query(query) query = "Can you tell me the story of Queen Elizabeth I from this 48 laws of power book?" index.query(query) query = "State the names of 5 laws?" index.query(query)