akash015 committed on
Commit
57a878a
1 Parent(s): ba101b8

Update app.py

Files changed (1)
  1. app.py +24 -6
app.py CHANGED
@@ -1,4 +1,4 @@
-import re
+import re
 import PyPDF2
 from langchain_community.embeddings import OllamaEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -22,7 +22,7 @@ llm_groq = ChatGroq(
 )
 
 # Initialize anonymizer
-anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL'], faker_seed=18)
+anonymizer = PresidioReversibleAnonymizer(analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE', 'URL', 'US_BANK_NUMBER', 'US_DRIVER_LICENSE', 'US_ITIN', 'US_PASSPORT', 'US_SSN'], faker_seed=18)
 
 def extract_text_from_pdf(file_path):
     pdf = PyPDF2.PdfReader(file_path)
@@ -125,6 +125,27 @@ async def on_chat_start():
         pdf_text
     )
 
+    # with splitting into chunks
+    # {
+    # # Split the sanitized text into chunks
+    # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    # texts = text_splitter.split_text(anonymized_text)
+
+    # # Create metadata for each chunk
+    # metadatas = [{"source": f"{i}-pl"} for i in range(len(texts))]
+
+    # # Create a Chroma vector store
+    # embeddings = OllamaEmbeddings(model="nomic-embed-text")
+    # docsearch = await cl.make_async(Chroma.from_texts)(
+    #     texts, embeddings, metadatas=metadatas
+    # )
+    # }
+
+    # without splitting into chunks
+    # {
+    # Create a Chroma vector store
+
+    # embeddings = OllamaEmbeddings(model="nomic-embed-text")
     embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
 
     docsearch = await cl.make_async(Chroma.from_texts)(
@@ -170,12 +191,9 @@ async def main(message: cl.Message):
     # Call the chain with user's message content
     res = await chain.ainvoke(message.content, callbacks=[cb])
     answer = anonymizer.deanonymize(
-        res["answer"]
+        "ok"+res["answer"]
     )
     text_elements = []
 
     # Return results
     await cl.Message(content=answer, elements=text_elements).send()
-
-
-
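
The main change in this commit widens the anonymizer's entity list with the US-specific Presidio recognizers (US_BANK_NUMBER, US_DRIVER_LICENSE, US_ITIN, US_PASSPORT, US_SSN). A minimal round-trip sketch of what the reversible anonymizer does with such a list, assuming the langchain_experimental import that app.py's usage implies (the import line sits outside this diff's context), with a reduced field list and sample text for illustration:

# Sketch only: not app.py's actual field list or input.
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

anonymizer = PresidioReversibleAnonymizer(
    analyzed_fields=["PERSON", "US_SSN"],  # subset of the fields used in app.py
    faker_seed=18,  # a fixed seed keeps the fake stand-ins stable across runs
)

masked = anonymizer.anonymize("John Doe's SSN is 078-05-1120.")
print(masked)  # detected PII replaced by faker-generated placeholders
print(anonymizer.deanonymize(masked))  # the stored mapping restores the originals

Deanonymization works because the object keeps a mapping from each fake value back to the original, which is why the same anonymizer instance must handle both directions, as app.py does when it deanonymizes the chain's answer.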
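The comment block added in on_chat_start preserves an alternative indexing path that chunks the sanitized text before embedding it. A synchronous sketch of that path, assuming langchain_community's Chroma wrapper and a locally served nomic-embed-text model for OllamaEmbeddings (the function name is illustrative, not from app.py):

# Sketch of the commented-out chunking path under the assumptions above.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

def build_chunked_index(anonymized_text: str) -> Chroma:
    # Split the sanitized text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(anonymized_text)
    # Create metadata so retrieved chunks can be traced back to their position
    metadatas = [{"source": f"{i}-pl"} for i in range(len(texts))]
    # Create a Chroma vector store over the chunks
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    return Chroma.from_texts(texts, embeddings, metadatas=metadatas)

In app.py the equivalent call is wrapped in cl.make_async, which runs the blocking indexing work in a worker thread so Chainlit's event loop stays responsive.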