Carlos Salgado committed
Commit e39bb0b
Parent(s): c40d04b

fallback on pypdf, trim flake, minor ux

Files changed (4):
  1. app.py +15 -15
  2. flake.nix +4 -18
  3. requirements.txt +2 -2
  4. scripts.py +33 -29
app.py CHANGED
@@ -13,17 +13,17 @@ def suggest_metadata(file_upload):
 
     with tempfile.NamedTemporaryFile(delete=False) as tmp:
         tmp.write(uploaded_file.read())
-        file_path = f'{tmp.name}.{extension}'
-        st.write(f'Created temporary file {file_path}')
+        st.write(f'Created temporary file {tmp.name}')
 
-    st.write('## Processing file with Unstructured')
-    docs = ingest(file_path)
-    metadata = generate_metadata(docs)
+    st.write('## Ingesting Unstructured file')
+
+    docs = ingest(tmp.name)
+    print(f'Ingested {tmp.name}')
 
+    metadata = generate_metadata(docs)
     st.write('## Querying Together.ai API')
-    form = st.form(key='generate_form')
-    st.write(f'## Suggested Metadata Generated by {MODEL_NAME}')
-    st.write(f'### {metadata}')
+    st.write(f'### Suggested Metadata Generated by {MODEL_NAME}')
+    st.write(f'#### {metadata}')
 
 with st.form('analyze_form'):
     st.write('Enter your file metadata in the following schema:')
@@ -38,14 +38,14 @@ with st.form('analyze_form'):
     analysis = analyze_metadata(filename, description, discipline)
 
     st.write(analysis)
+    submitted = None
 
 st.write('## Generate metadata?')
-uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf","txt"])
+uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
 
 if uploaded_file is not None:
 
-    suggest_metadata(uploaded_file)
-
-    delete_file_button = form.form_submit_button(label='Delete file')
-    if delete_file_button:
-        os.remove(file_path)
+    query_api = st.button('Query API')
+    if query_api:
+        suggest_metadata(uploaded_file)
+        query_api = None
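Review note: suggest_metadata() now passes tmp.name straight to ingest(), which fixes the old bug where f'{tmp.name}.{extension}' pointed at a path that was never actually written. One remaining wrinkle is that NamedTemporaryFile creates a file with no extension, and some loaders sniff file types by suffix. A minimal sketch of keeping the .pdf suffix on the real file (hypothetical save_upload helper, not part of this commit):

import tempfile

def save_upload(uploaded_file):
    # delete=False keeps the file on disk after the with-block so
    # ingest() can open it by path; suffix preserves the extension
    # for loaders that pick a parser based on the file name.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
        tmp.write(uploaded_file.read())
    return tmp.name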
 
flake.nix CHANGED
@@ -1,6 +1,6 @@
 {
   description = "A LLM backend development flake powered by unstructured and langchain";
-
+
   inputs = {
     nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable";
   };
@@ -9,6 +9,7 @@
     system = "x86_64-linux";
     # ↑ Swap it for your system if needed
     # "aarch64-linux" / "x86_64-darwin" / "aarch64-darwin"
+    debug = true;
     pkgs = nixpkgs.legacyPackages.${system};
   in {
     devShells.${system}.default = pkgs.mkShell {
@@ -17,33 +18,18 @@
           python-pkgs.pip # VsCode starts
           python-pkgs.jupyter
           python-pkgs.notebook # VsCode ends
-          python-pkgs.numpy
           python-pkgs.pandas
-          python-pkgs.scipy
-          python-pkgs.matplotlib
           python-pkgs.requests
           python-pkgs.langchain-community
           python-pkgs.langchain
           python-pkgs.langchain-text-splitters
+          python-pkgs.pypdf
-          python-pkgs.unstructured
-          python-pkgs.wrapt # unstructured[local-inference] starts
-          python-pkgs.iso-639
-          python-pkgs.emoji
-          python-pkgs.pillow-heif
-          python-pkgs.magic
-          python-pkgs.poppler-qt5
-          python-pkgs.pytesseract
-          python-pkgs.langdetect # unstructured[local-inference] ends
           python-pkgs.openai
-          python-pkgs.pydantic
           python-pkgs.python-dotenv
           python-pkgs.configargparse
           python-pkgs.streamlit
-          python-pkgs.lark
           python-pkgs.sentence-transformers
+          python-pkgs.unstructured
-          pkgs.unstructured-api
-          pkgs.poppler
-          pkgs.haskellPackages.iso639
         ]))
       ];
 
requirements.txt CHANGED
@@ -7,5 +7,5 @@ streamlit
 python-dotenv
 sentence-transformers
 iso639-lang
-poppler
-unstructured[all-docs]
+unstructured[pdf]
+pypdf
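Review note: the trimmed requirements drop the heavyweight unstructured[all-docs] extra in favor of the pdf-only extra plus pypdf, matching the new PyPDFLoader-first ingest path. A quick smoke test of the slimmer install (a sketch; run it inside the dev shell):

import pypdf
from unstructured.partition.pdf import partition_pdf  # pulled in by unstructured[pdf]

# any version string printing here means both halves of the
# pypdf/unstructured fallback pair import cleanly
print(pypdf.__version__)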
scripts.py CHANGED
@@ -5,8 +5,11 @@ import json
 import openai
 import sys
 from dotenv import load_dotenv
+
 from langchain_community.document_loaders import TextLoader
+from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.document_loaders import UnstructuredPDFLoader
+
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Vectara
 from langchain_core.output_parsers import StrOutputParser
@@ -56,35 +59,35 @@ def get_sources(documents):
 def get_summary(documents):
     return documents[-1].page_content
 
-def ingest(file_path):
-    extension = os.path.splitext(file_path)[1].lower()
-
-    if extension == '.pdf':
+def ingest(file_path):
+    try:
+        loader = PyPDFLoader(file_path)
+        documents = loader.load()
+        print('Loaded PyPDFLoader')
+    except Exception as e:
+        print(f'{e}')
         loader = UnstructuredPDFLoader(file_path)
-    elif extension == '.txt':
-        loader = TextLoader(file_path)
-    else:
-        raise NotImplementedError('Only .txt or .pdf files are supported')
-
-    # transform locally
-    documents = loader.load()
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0,
-        separators=[
-            "\n\n",
-            "\n",
-            " ",
-            ",",
-            "\uff0c", # Fullwidth comma
-            "\u3001", # Ideographic comma
-            "\uff0e", # Fullwidth full stop
-            # "\u200B", # Zero-width space (Asian languages)
-            # "\u3002", # Ideographic full stop (Asian languages)
-            "",
-        ])
-    docs = text_splitter.split_documents(documents)
-
-    return docs
-
+        documents = loader.load()
+        print('Loaded UnstructuredPDFLoader')
+    finally:
+        # transform locally
+        documents = loader.load()
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0,
+            separators=[
+                "\n\n",
+                "\n",
+                " ",
+                ",",
+                "\uff0c", # Fullwidth comma
+                "\u3001", # Ideographic comma
+                "\uff0e", # Fullwidth full stop
+                # "\u200B", # Zero-width space (Asian languages)
+                # "\u3002", # Ideographic full stop (Asian languages)
+                "",
+            ])
+        docs = text_splitter.split_documents(documents)
+
+        return docs
 
 
 def generate_metadata(docs):
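Review note: in the new ingest(), the finally block calls loader.load() a second time, re-parsing the document that the try/except already loaded, and it returns from inside finally, which would also swallow any exception raised by the UnstructuredPDFLoader fallback itself. A sketch of the same pypdf-first fallback without those two issues (same imports as scripts.py assumed):

def ingest(file_path):
    try:
        # prefer the lightweight pypdf parser
        documents = PyPDFLoader(file_path).load()
        print('Loaded PyPDFLoader')
    except Exception as e:
        # fall back to unstructured only when pypdf fails;
        # an exception raised here now propagates normally
        print(f'{e}')
        documents = UnstructuredPDFLoader(file_path).load()
        print('Loaded UnstructuredPDFLoader')

    # transform locally, reusing the documents already loaded above
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=0,
        separators=["\n\n", "\n", " ", ",", "\uff0c", "\u3001", "\uff0e", ""])
    return text_splitter.split_documents(documents)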
 
@@ -126,8 +129,9 @@ def generate_metadata(docs):
             {
         ]
     )
+    return chat_completion.choices[0].message.content
 
-    return json.loads(chat_completion.choices[0].message.content)
+    #return json.loads(chat_completion.choices[0].message.content)
 
 
 def analyze_metadata(filename, description, discipline):
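Review note: generate_metadata() now returns the raw completion text, with the json.loads() call left commented out, presumably because the model does not always emit valid JSON. A sketch of a middle ground that keeps structured output when possible (hypothetical parse_metadata helper, not part of this commit):

import json

def parse_metadata(raw):
    # try the structured path first; surface the raw text when the
    # model response is prose or malformed JSON rather than crashing
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return raw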