File size: 2,737 Bytes
c60e255 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# -*- coding: utf-8 -*-
"""pdf chat.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1RXTs4FPcFCVb9_ZAWBBxLoYQEcKz37x9
"""
!pip install langchain
!pip install unstructured # The unstructured library provides open-source components for pre-processing text documents such as PDFs, HTML and Word Documents.
!pip install openai
!pip install pybind11 # pybind11 is a lightweight header-only library that exposes C++ types in Python
!pip install chromadb # the AI-native open-source embedding database
!pip install Cython # Cython is an optimising static compiler for both the Python programming language
!pip3 install "git+https://github.com/philferriere/cocoapi.git#egg=pycocotools&subdirectory=PythonAPI" # COCO is a large image dataset designed for object detection, segmentation, person keypoints detection, stuff segmentation, and caption generation
!pip install unstructured[local-inference]
!CC=clang CXX=clang++ ARCHFLAGS="-arch x86_64" pip install 'git+https://github.com/facebookresearch/detectron2.git' # Detectron2 is Facebook AI Research's next generation library that provides state-of-the-art detection and segmentation algorithms.
!pip install layoutparser[layoutmodels,tesseract] # A Unified Toolkit for Deep Learning Based Document Image Analysis
!pip install pytesseract # Python-tesseract is an optical character recognition (OCR) tool for python.
!pip install Pillow==9.0.0 # The Python Imaging Library adds image processing capabilities to your Python interpreter. Need this version, otherwise errors occur.
!pip install tiktoken
!pip install --upgrade Pillow
import os
os.environ['OPENAI_API_KEY'] = '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'
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from detectron2.config import get_cfg
cfg = get_cfg()
cfg.MODEL.DEVICE = 'gpu' #GPU is recommended
!wget https://pgcag.files.wordpress.com/2010/01/48lawsofpower.pdf #meta earnings; replace with any pdf
!mkdir docs
!mv 48lawsofpower.pdf docs
text_folder = 'docs'
loaders = [UnstructuredPDFLoader(os.path.join(text_folder, fn)) for fn in os.listdir(text_folder)]
!apt-get install poppler-utils # error occurs without this, pdf rendering library
index = VectorstoreIndexCreator().from_loaders(loaders)
query = "Can you give me an example from history where the enemy was crushed totally from the book?"
index.query(query)
query = "What's the point of making myself less accessible?"
index.query(query)
query = "Can you tell me the story of Queen Elizabeth I from this 48 laws of power book?"
index.query(query)
query = "State the names of 5 laws?"
index.query(query)
|