karthik1362 committed on
Commit
c60e255
·
verified ·
1 Parent(s): c6639dd

Upload 2 files

Browse files
Files changed (2) hide show
  1. pdf_chat.ipynb +0 -0
  2. pdf_chat.py +58 -0
pdf_chat.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
pdf_chat.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """pdf chat.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1RXTs4FPcFCVb9_ZAWBBxLoYQEcKz37x9
8
+ """
9
+
10
+ !pip install langchain
11
+ !pip install unstructured # The unstructured library provides open-source components for pre-processing text documents such as PDFs, HTML and Word Documents.
12
+ !pip install openai
13
+ !pip install pybind11 # pybind11 is a lightweight header-only library that exposes C++ types in Python
14
+ !pip install chromadb # the AI-native open-source embedding database
15
+ !pip install Cython # Cython is an optimising static compiler for both the Python programming language and the extended Cython programming language
16
+ !pip3 install "git+https://github.com/philferriere/cocoapi.git#egg=pycocotools&subdirectory=PythonAPI" # COCO is a large image dataset designed for object detection, segmentation, person keypoints detection, stuff segmentation, and caption generation
17
+ !pip install unstructured[local-inference]
18
+ !CC=clang CXX=clang++ ARCHFLAGS="-arch x86_64" pip install 'git+https://github.com/facebookresearch/detectron2.git' # Detectron2 is Facebook AI Research's next generation library that provides state-of-the-art detection and segmentation algorithms.
19
+ !pip install layoutparser[layoutmodels,tesseract] # A Unified Toolkit for Deep Learning Based Document Image Analysis
20
+ !pip install pytesseract # Python-tesseract is an optical character recognition (OCR) tool for python.
21
+ !pip install Pillow==9.0.0 # The Python Imaging Library adds image processing capabilities to your Python interpreter. Need this version, otherwise errors occur. NOTE: the later pip install --upgrade Pillow step overrides this pin.
22
+ !pip install tiktoken
23
+ !pip install --upgrade Pillow
24
+
25
+ import os
26
+ os.environ['OPENAI_API_KEY'] = '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'
27
+
28
+ from langchain.document_loaders import UnstructuredPDFLoader
29
+ from langchain.indexes import VectorstoreIndexCreator
30
+
31
+ from detectron2.config import get_cfg
32
+ cfg = get_cfg()
33
+ cfg.MODEL.DEVICE = 'gpu' #GPU is recommended
34
+
35
+ !wget https://pgcag.files.wordpress.com/2010/01/48lawsofpower.pdf # The 48 Laws of Power; replace with any PDF
36
+
37
+ !mkdir docs
38
+ !mv 48lawsofpower.pdf docs
39
+
40
+ text_folder = 'docs'
41
+ loaders = [UnstructuredPDFLoader(os.path.join(text_folder, fn)) for fn in os.listdir(text_folder)]
42
+
43
+ !apt-get install poppler-utils # error occurs without this, pdf rendering library
44
+
45
+ index = VectorstoreIndexCreator().from_loaders(loaders)
46
+
47
+ query = "Can you give me an example from history where the enemy was crushed totally from the book?"
48
+ index.query(query)
49
+
50
+ query = "What's the point of making myself less accessible?"
51
+ index.query(query)
52
+
53
+ query = "Can you tell me the story of Queen Elizabeth I from this 48 laws of power book?"
54
+ index.query(query)
55
+
56
+ query = "State the names of 5 laws?"
57
+ index.query(query)
58
+