Spaces:

salgadev
/

docverifyrag

Sleeping

App Files Files Community

Carlos Salgado committed on Apr 19, 2024

Commit

e39bb0b

1 Parent(s): c40d04b

fallback on pypdf, trim flake, minor ux

Browse files

Files changed (4) hide show

app.py +15 -15
flake.nix +4 -18
requirements.txt +2 -2
scripts.py +33 -29

app.py CHANGED Viewed

@@ -13,17 +13,17 @@ def suggest_metadata(file_upload):
     with tempfile.NamedTemporaryFile(delete=False) as tmp:
         tmp.write(uploaded_file.read())
-        file_path = f'{tmp.name}.{extension}'
-        st.write(f'Created temporary file {file_path}')
-    st.write('## Processing file with Unstructured')
-    docs = ingest(file_path)
-    metadata = generate_metadata(docs)
     st.write('## Querying Together.ai API')
-    form = st.form(key='generate_form')
-    st.write(f'## Suggested Metadata Generated by {MODEL_NAME}')
-    st.write(f'### {metadata}')
 with st.form('analyze_form'):
     st.write('Enter your file metadata in the following schema:')
@@ -38,14 +38,14 @@ with st.form('analyze_form'):
         analysis = analyze_metadata(filename, description, discipline)
         st.write(analysis)
 st.write('## Generate metadata?')
-uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf","txt"])
-if uploaded_file is not None:
-    suggest_metadata(uploaded_file)
-    delete_file_button = form.form_submit_button(label='Delete file')
-    if delete_file_button:
-        os.remove(file_path)

     with tempfile.NamedTemporaryFile(delete=False) as tmp:
         tmp.write(uploaded_file.read())
+        st.write(f'Created temporary file {tmp.name}')
+    st.write('## Ingesting Unstructured file')
+    docs = ingest(tmp.name)
+    print(f'Ingested {tmp.name}')
+    metadata = generate_metadata(docs)
     st.write('## Querying Together.ai API')
+    st.write(f'### Suggested Metadata Generated by {MODEL_NAME}')
+    st.write(f'#### {metadata}')
 with st.form('analyze_form'):
     st.write('Enter your file metadata in the following schema:')
         analysis = analyze_metadata(filename, description, discipline)
         st.write(analysis)
+        submitted = None
 st.write('## Generate metadata?')
+uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
+if uploaded_file is not None:
+    query_api = st.button('Query API')
+    if query_api:
+        suggest_metadata(uploaded_file)
+        query_api = None

flake.nix CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   description = "A LLM backend development flake powered by unstructured and langchain";
   inputs = {
     nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable";
   };
@@ -9,6 +9,7 @@
     system = "x86_64-linux";
     #       ↑ Swap it for your system if needed
     #       "aarch64-linux" / "x86_64-darwin" / "aarch64-darwin"
     pkgs = nixpkgs.legacyPackages.${system};
   in {
     devShells.${system}.default = pkgs.mkShell {
@@ -17,33 +18,18 @@
           python-pkgs.pip # VsCode starts
           python-pkgs.jupyter
           python-pkgs.notebook # VsCode ends
-          python-pkgs.numpy
           python-pkgs.pandas
-          python-pkgs.scipy
-          python-pkgs.matplotlib
           python-pkgs.requests
           python-pkgs.langchain-community
           python-pkgs.langchain
           python-pkgs.langchain-text-splitters
-          python-pkgs.unstructured
-          python-pkgs.wrapt # unstructured[local-inference] starts
-          python-pkgs.iso-639
-          python-pkgs.emoji
-          python-pkgs.pillow-heif
-          python-pkgs.magic
-          python-pkgs.poppler-qt5
-          python-pkgs.pytesseract
-          python-pkgs.langdetect # unstructured[local-inference] ends
           python-pkgs.openai
-          python-pkgs.pydantic
           python-pkgs.python-dotenv
           python-pkgs.configargparse
           python-pkgs.streamlit
-          python-pkgs.lark
           python-pkgs.sentence-transformers
-          pkgs.unstructured-api
-          pkgs.poppler
-          pkgs.haskellPackages.iso639
         ]))
       ];

 {
   description = "A LLM backend development flake powered by unstructured and langchain";
   inputs = {
     nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable";
   };
     system = "x86_64-linux";
     #       ↑ Swap it for your system if needed
     #       "aarch64-linux" / "x86_64-darwin" / "aarch64-darwin"
+    debug = true;
     pkgs = nixpkgs.legacyPackages.${system};
   in {
     devShells.${system}.default = pkgs.mkShell {
           python-pkgs.pip # VsCode starts
           python-pkgs.jupyter
           python-pkgs.notebook # VsCode ends
           python-pkgs.pandas
           python-pkgs.requests
           python-pkgs.langchain-community
           python-pkgs.langchain
           python-pkgs.langchain-text-splitters
+          python-pkgs.pypdf
           python-pkgs.openai
           python-pkgs.python-dotenv
           python-pkgs.configargparse
           python-pkgs.streamlit
           python-pkgs.sentence-transformers
+          python-pkgs.unstructured
         ]))
       ];

requirements.txt CHANGED Viewed

@@ -7,5 +7,5 @@ streamlit
 python-dotenv
 sentence-transformers
 iso639-lang
-poppler
-unstructured[all-docs]

 python-dotenv
 sentence-transformers
 iso639-lang
+unstructured[pdf]
+pypdf

scripts.py CHANGED Viewed

@@ -5,8 +5,11 @@ import json
 import openai
 import sys
 from dotenv import load_dotenv
 from langchain_community.document_loaders import TextLoader
 from langchain_community.document_loaders import UnstructuredPDFLoader
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Vectara
 from langchain_core.output_parsers import StrOutputParser
@@ -56,35 +59,35 @@ def get_sources(documents):
 def get_summary(documents):
     return documents[-1].page_content
-def ingest(file_path):
-    extension = os.path.splitext(file_path)[1].lower()
-    if extension == '.pdf':
         loader = UnstructuredPDFLoader(file_path)
-    elif extension == '.txt':
-        loader = TextLoader(file_path)
-    else:
-        raise NotImplementedError('Only .txt or .pdf files are supported')
-    # transform locally
-    documents = loader.load()
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0,
-    separators=[
-        "\n\n",
-        "\n",
-        " ",
-        ",",
-        "\uff0c",  # Fullwidth comma
-        "\u3001",  # Ideographic comma
-        "\uff0e",  # Fullwidth full stop
-        # "\u200B",  # Zero-width space (Asian languages)
-        # "\u3002",  # Ideographic full stop (Asian languages)
-        "",
-    ])
-    docs = text_splitter.split_documents(documents)
-    return docs
 def generate_metadata(docs):
@@ -126,8 +129,9 @@ def generate_metadata(docs):
             }
         ]
     )
-    return json.loads(chat_completion.choices[0].message.content)
 def analyze_metadata(filename, description, discipline):

 import openai
 import sys
 from dotenv import load_dotenv
 from langchain_community.document_loaders import TextLoader
+from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.document_loaders import UnstructuredPDFLoader
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Vectara
 from langchain_core.output_parsers import StrOutputParser
 def get_summary(documents):
     """Return the ``page_content`` of the last entry in ``documents``."""
     final_document = documents[-1]
     return final_document.page_content
def ingest(file_path):
    """Load a PDF file and split it into chunks for downstream processing.

    PyPDFLoader is tried first. If it raises, the error is printed and
    UnstructuredPDFLoader is used as a fallback. The file is loaded exactly
    once by whichever loader succeeds, and an exception raised by the
    fallback loader propagates to the caller.

    Args:
        file_path: Path of the PDF file to load.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` on
        the loaded documents (chunk_size=1000, chunk_overlap=0).
    """
    try:
        documents = PyPDFLoader(file_path).load()
        print('Loaded PyPDFLoader')
    except Exception as e:
        # Deliberate best-effort: any PyPDF failure triggers the fallback.
        print(f'{e}')
        documents = UnstructuredPDFLoader(file_path).load()
        print('Loaded UnstructuredPDFLoader')

    # transform locally
    # NOTE: splitting happens after the try/except rather than in a
    # ``finally`` with ``return``, which would swallow in-flight exceptions
    # and forced a redundant second ``load()`` of the same file.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        separators=[
            "\n\n",
            "\n",
            " ",
            ",",
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            # "\u200B",  # Zero-width space (Asian languages)
            # "\u3002",  # Ideographic full stop (Asian languages)
            "",
        ],
    )
    return text_splitter.split_documents(documents)
 def generate_metadata(docs):
             }
         ]
     )
+    return chat_completion.choices[0].message.content
+    #return json.loads(chat_completion.choices[0].message.content)
 def analyze_metadata(filename, description, discipline):