raghuram13 and skeitel committed
Commit 0f4521b
0 Parent(s)

Duplicate from skeitel/pdf_to_langchain_ai_gradio


Co-authored-by: Skeitel <skeitel@users.noreply.huggingface.co>

Files changed (4):
  1. .gitattributes +34 -0
  2. README.md +13 -0
  3. app.py +98 -0
  4. requirements.txt +0 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Pdf To Langchain Ai Gradio
+ emoji: 📉
+ colorFrom: pink
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 3.27.0
+ app_file: app.py
+ pinned: false
+ duplicated_from: skeitel/pdf_to_langchain_ai_gradio
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,98 @@
+ # GRADIO INTERFACE TO CONVERT A PDF TO TEXT AND READ IT WITH LANGCHAIN AND OPENAI ###################################
+ import gradio as gr
+ import os, shutil
+ from pypdf import PdfReader
+ from llama_index import SimpleDirectoryReader, GPTSimpleVectorIndex, LLMPredictor, PromptHelper
+ from langchain.chat_models import ChatOpenAI
+ import openai
+
+
+ def extract_info(pdf_file):
+
+     # BEGINS PDF TO TEXT SECTION ###################
+     if not pdf_file.name.lower().endswith('.pdf'):
+         return 'Please upload a .pdf file.'
+     reader = PdfReader(pdf_file.name)
+     pages = reader.pages
+     extracted_text = [page.extract_text() for page in pages]
+
+     # WRITING TEXT FILE TO FOLDER ##############
+     directory_name = 'converted_pdf_to_text'
+     if not os.path.exists(directory_name):
+         os.mkdir(directory_name)
+     file_name = 'document_in_txt_format.txt'
+     file_path = os.path.join(directory_name, file_name)
+     with open(file_path, 'w', encoding='utf-8') as f:
+         f.write('\n'.join(extracted_text))
+     if os.path.isfile(file_path):
+         print(f'{file_name} created successfully in {directory_name}.')
+     else:
+         print(f"{file_name} creation in {directory_name} failed.")
+
+     # BEGINS LLM SECTION ##########
+     max_input_size = 4096
+     num_outputs = 500
+     max_chunk_overlap = 200
+     chunk_size_limit = 4000
+
+     llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo', max_tokens=num_outputs))
+     prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)
+
+     documents = SimpleDirectoryReader(directory_name).load_data()
+     global index
+     index = GPTSimpleVectorIndex(documents, llm_predictor=llm_predictor, prompt_helper=prompt_helper)
+
+     # Remove the json file if it exists, to make sure a previous index is not used as source
+     if os.path.exists("index.json"):
+         os.remove("index.json")
+         print("The file 'index.json' has been deleted.")
+     else:
+         print("The file 'index.json' does not exist.")
+
+     # Save the json index to disk from the current document
+     index.save_to_disk('index.json')
+
+     # Remove the directory with the intermediate text file
+     # shutil.rmtree(directory_name)
+     return "Success! You can now click on the 'Knowledge bot' tab to interact with your document"
+
+
+ def chat(chat_history, user_input):
+
+     # index is the global vector index built by extract_info()
+     bot_response = index.query(user_input)
+     response = ''
+     # Show the answer progressively, character by character
+     for letter in bot_response.response:
+         response += letter
+         yield chat_history + [(user_input, response)]
+
+
+ # System prompt (defined for reference; it is not currently passed to the index query)
+ messages = [{"role": "system", "content": """You are a helpful assistant. You help the reader understand documents by paraphrasing, quoting and summarizing information. You follow the instructions of the user at all times."""}]
+
+ openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+ with gr.Blocks() as demo:
+
+     gr.Markdown('Q&A bot for PDF docs. Upload your document, press the button and wait for confirmation of success.')
+
+     with gr.Tab('Input PDF document here'):
+         text_input = gr.File()
+         text_output = gr.Textbox()
+         text_button = gr.Button('Build the bot!')
+         text_button.click(extract_info, text_input, text_output)
+     with gr.Tab('Knowledge bot'):
+         chatbot = gr.Chatbot()
+         message = gr.Textbox(label='Ask your question about the document here, then press "enter" and scroll up for the response')
+         message.submit(chat, [chatbot, message], chatbot)
+
+
+ if __name__ == "__main__":
+     # queue() is needed so the generator in chat() can stream partial responses
+     demo.queue().launch(debug=True)
requirements.txt ADDED
Binary file (1.75 kB).
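
Because requirements.txt is stored as a binary blob, the pinned package versions are not visible in this view. For orientation only, here is a minimal sketch of reusing the index.json that extract_info() persists, outside the Gradio UI. It assumes the same pre-0.6 llama_index API that app.py imports (GPTSimpleVectorIndex with save_to_disk/load_from_disk) and an OPENAI_API_KEY in the environment; the sample question is a placeholder.

    # Sketch only: reload the index saved by app.py's extract_info() and query it once.
    # Assumes the pre-0.6 llama_index API used in app.py; the question below is a placeholder.
    import os
    import openai
    from llama_index import GPTSimpleVectorIndex

    openai.api_key = os.getenv("OPENAI_API_KEY")  # same environment variable app.py reads

    index = GPTSimpleVectorIndex.load_from_disk('index.json')  # file written by extract_info()
    answer = index.query('What is this document about?')       # placeholder question
    print(answer.response)

The .response attribute printed here is the same string that chat() streams character by character in the Gradio app.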