akash015 committed on
Commit
57a878a
1 Parent(s): ba101b8

Update app.py

Files changed (1)
  1. app.py +24 -6
app.py CHANGED
@@ -1,4 +1,4 @@
-import re
+import re
 import PyPDF2
 from langchain_community.embeddings import OllamaEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -22,7 +22,7 @@ llm_groq = ChatGroq(
 )
 
 # Initialize anonymizer
-anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL'], faker_seed=18)
+anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL', 'US_BANK_NUMBER', 'US_DRIVER_LICENSE', 'US_ITIN', 'US_PASSPORT', 'US_SSN'], faker_seed=18)
 
 def extract_text_from_pdf(file_path):
     pdf = PyPDF2.PdfReader(file_path)
@@ -125,6 +125,27 @@ async def on_chat_start():
         pdf_text
     )
 
+    # with splitting into chunks
+    # {
+    # # Split the sanitized text into chunks
+    # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    # texts = text_splitter.split_text(anonymized_text)
+
+    # # Create metadata for each chunk
+    # metadatas = [{"source": f"{i}-pl"} for i in range(len(texts))]
+
+    # # Create a Chroma vector store
+    # embeddings = OllamaEmbeddings(model="nomic-embed-text")
+    # docsearch = await cl.make_async(Chroma.from_texts)(
+    #     texts, embeddings, metadatas=metadatas
+    # )
+    # }
+
+    # without splitting into chunks
+    # {
+    # Create a Chroma vector store
+
+    # embeddings = OllamaEmbeddings(model="nomic-embed-text")
     embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
 
     docsearch = await cl.make_async(Chroma.from_texts)(
@@ -170,12 +191,9 @@ async def main(message: cl.Message):
     # Call the chain with user's message content
     res = await chain.ainvoke(message.content, callbacks=[cb])
     answer = anonymizer.deanonymize(
-        res["answer"]
+        "ok"+res["answer"]
     )
     text_elements = []
 
     # Return results
     await cl.Message(content=answer, elements=text_elements).send()
-
-
-
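
The main change in this commit widens the anonymizer's entity list with the US-specific Presidio recognizers (US_BANK_NUMBER, US_DRIVER_LICENSE, US_ITIN, US_PASSPORT, US_SSN). A minimal round-trip sketch of what the reversible anonymizer does with such a list, assuming the langchain_experimental import that app.py's usage implies (the import line sits outside this diff's context), with a reduced field list and sample text for illustration:

# Sketch only: not app.py's actual field list or input.
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

anonymizer = PresidioReversibleAnonymizer(
    analyzed_fields=["PERSON", "US_SSN"],  # subset of the fields used in app.py
    faker_seed=18,  # a fixed seed keeps the fake stand-ins stable across runs
)

masked = anonymizer.anonymize("John Doe's SSN is 078-05-1120.")
print(masked)  # detected PII replaced by faker-generated placeholders
print(anonymizer.deanonymize(masked))  # the stored mapping restores the originals

Deanonymization works because the object keeps a mapping from each fake value back to the original, which is why the same anonymizer instance must handle both directions, as app.py does when it deanonymizes the chain's answer.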
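The comment block added in on_chat_start preserves an alternative indexing path that chunks the sanitized text before embedding it. A synchronous sketch of that path, assuming langchain_community's Chroma wrapper and a locally served nomic-embed-text model for OllamaEmbeddings (the function name is illustrative, not from app.py):

# Sketch of the commented-out chunking path under the assumptions above.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

def build_chunked_index(anonymized_text: str) -> Chroma:
    # Split the sanitized text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(anonymized_text)
    # Create metadata so retrieved chunks can be traced back to their position
    metadatas = [{"source": f"{i}-pl"} for i in range(len(texts))]
    # Create a Chroma vector store over the chunks
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    return Chroma.from_texts(texts, embeddings, metadatas=metadatas)

In app.py the equivalent call is wrapped in cl.make_async, which runs the blocking indexing work in a worker thread so Chainlit's event loop stays responsive.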