acpotts committed on
Commit
2859653
1 Parent(s): 4eb274b

add pdf capability

Browse files
Files changed (2) hide show
  1. app.py +18 -7
  2. requirements.txt +3 -1
app.py CHANGED
@@ -10,7 +10,11 @@ from aimakerspace.openai_utils.prompts import (
10
  from aimakerspace.openai_utils.embedding import EmbeddingModel
11
  from aimakerspace.vectordatabase import VectorDatabase
12
  from aimakerspace.openai_utils.chatmodel import ChatOpenAI
 
 
13
  import chainlit as cl
 
 
14
 
15
  system_template = """\
16
  Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer."""
@@ -47,24 +51,31 @@ class RetrievalAugmentedQAPipeline:
47
 
48
  return {"response": generate_response(), "context": context_list}
49
 
50
- text_splitter = CharacterTextSplitter()
51
-
52
 
53
  def process_text_file(file: AskFileResponse):
54
- import tempfile
55
 
56
- with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as temp_file:
57
  temp_file_path = temp_file.name
58
 
59
  with open(temp_file_path, "wb") as f:
60
  f.write(file.content)
61
 
62
- text_loader = TextFileLoader(temp_file_path)
63
- documents = text_loader.load_documents()
64
- texts = text_splitter.split_texts(documents)
 
 
 
 
 
 
65
  return texts
66
 
67
 
 
68
  @cl.on_chat_start
69
  async def on_chat_start():
70
  files = None
 
10
  from aimakerspace.openai_utils.embedding import EmbeddingModel
11
  from aimakerspace.vectordatabase import VectorDatabase
12
  from aimakerspace.openai_utils.chatmodel import ChatOpenAI
13
+ from langchain_experimental.text_splitter import SemanticChunker
14
+ from langchain_openai.embeddings import OpenAIEmbeddings
15
  import chainlit as cl
16
+ import tempfile
17
+ from langchain_community.document_loaders.pdf import PyPDFLoader
18
 
19
  system_template = """\
20
  Use the following context to answer a users question. If you cannot find the answer in the context, say you don't know the answer."""
 
51
 
52
  return {"response": generate_response(), "context": context_list}
53
 
54
+ # text_splitter = CharacterTextSplitter()
55
+ text_splitter = SemanticChunker(OpenAIEmbeddings(), breakpoint_threshold_type="standard_deviation")
56
 
57
  def process_text_file(file: AskFileResponse):
58
+
59
 
60
+ with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=file.name) as temp_file:
61
  temp_file_path = temp_file.name
62
 
63
  with open(temp_file_path, "wb") as f:
64
  f.write(file.content)
65
 
66
+ if file.type == 'text/plain':
67
+ text_loader = TextFileLoader(temp_file_path)
68
+ documents = text_loader.load_documents()
69
+ elif file.type == 'application/pdf':
70
+ pdf_loader = PyPDFLoader(temp_file_path)
71
+ documents = pdf_loader.load()
72
+ else:
73
+ raise ValueError("Provide a .txt or .pdf file")
74
+ texts = [x.page_content for x in text_splitter.transform_documents(documents)]
75
  return texts
76
 
77
 
78
+
79
  @cl.on_chat_start
80
  async def on_chat_start():
81
  files = None
requirements.txt CHANGED
@@ -1,3 +1,5 @@
1
  numpy
2
  chainlit==0.7.700
3
- openai
 
 
 
1
  numpy
2
  chainlit==0.7.700
3
+ openai
4
+ langchain_experimental
5
+ langchain_openai