dj9801 committed on
Commit
ac83258
1 Parent(s): a9d4a4e

removed gemini API and improved the UI

Browse files
Files changed (5) hide show
  1. .gitignore +164 -0
  2. README.md +5 -5
  3. app.py +159 -0
  4. ocr.py +39 -0
  5. requirements.txt +0 -0
.gitignore ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ .idea/
161
+
162
+ #custom
163
+ .streamlit
164
+ /data
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Ask Your Pdf
3
- emoji: 💻
4
- colorFrom: indigo
5
- colorTo: gray
6
  sdk: streamlit
7
- sdk_version: 1.33.0
8
  app_file: app.py
9
  pinned: false
10
  ---
 
1
  ---
2
+ title: Ask Pdf
3
+ emoji: 😻
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: streamlit
7
+ sdk_version: 1.31.0
8
  app_file: app.py
9
  pinned: false
10
  ---
app.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ load_dotenv()
3
+
4
+ import pickle
5
+ from dotenv import load_dotenv
6
+ import streamlit as st
7
+ from streamlit_chat import message
8
+ import os
9
+ from ocr import convert_pdf_to_images, extract_text_with_easyocr
10
+ from langchain.prompts import PromptTemplate
11
+ from langchain.chains import RetrievalQA
12
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
13
+ from langchain_community.document_loaders import PyPDFLoader
14
+ from langchain_community.vectorstores import FAISS
15
+ from langchain.docstore.document import Document
16
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
17
+ from langchain import HuggingFaceHub
18
+
19
+ load_dotenv()
20
+
21
# @st.cache_resource
def create_vector_store(file_path):
    """Build a FAISS vector store from the text of the PDF at *file_path*.

    Text is first extracted with PyPDFLoader; if that yields (almost) no
    text, the PDF is treated as scanned and the EasyOCR fallback is used.
    """
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    raw_text = "".join(page.page_content for page in pages)

    # Fewer than 10 extractable characters -> assume a scanned/image-only PDF.
    if len(raw_text) < 10:
        raw_text = extract_text_with_easyocr(convert_pdf_to_images(file_path))

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=10000, chunk_overlap=200
    )
    chunks = splitter.split_text(raw_text)
    # Wrap each chunk in a Document so FAISS can index it.
    documents = [Document(page_content=chunk) for chunk in chunks]

    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
    vectorstore_faiss = FAISS.from_documents(documents=documents, embedding=embeddings)
    return vectorstore_faiss
43
+
44
def create_prompt_template():
    """Return the PromptTemplate that frames retrieval-QA answers.

    The template carries two variables: ``context`` (retrieved chunks)
    and ``question`` (the user's query).
    """
    template = """
Human: Answer the question as a full sentence from the context provided. If you don't know the answer, don't try to make up an answer.
<context>
{context}
</context>
Question: {question}
Assistant:"""
    return PromptTemplate(
        template=template,
        input_variables=["context", "question"],
    )
56
+
57
+
58
# @st.cache_resource
def create_retrieval_chain(vector_store, prompt_template):
    """Wire a RetrievalQA chain over *vector_store* using *prompt_template*.

    Uses a hosted Mistral-7B-Instruct model via the HuggingFace Hub and a
    "stuff" chain that packs the top-6 similar chunks into the prompt.
    """
    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2",
        model_kwargs={"max_new_tokens": 4000},
    )
    retriever = vector_store.as_retriever(
        search_type="similarity", search_kwargs={"k": 6}
    )
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": prompt_template},
    )
    return qa
70
+
71
+
72
def generate_response(chain, input_question):
    """Run *input_question* through the QA *chain* and return the answer text."""
    result = chain({"query": input_question})
    return result["result"]
75
+
76
+
77
def get_file_size(file):
    """Return the size of the file-like *file* in megabytes.

    The cursor is rewound to the start before returning, so callers can
    read the file afterwards as if untouched.
    """
    file.seek(0, os.SEEK_END)
    size_in_bytes = file.tell()
    file.seek(0)
    return size_in_bytes / (1024 * 1024)
83
+
84
+
85
# Display conversation history using Streamlit messages
def display_conversation(history):
    """Render the chat transcript stored in *history*.

    *history* is dict-like with parallel lists: ``past`` (user turns)
    and ``generated`` (assistant turns).
    """
    for idx, generated in enumerate(history["generated"]):
        message(history["past"][idx], is_user=True, key=str(idx) + "_user")
        if generated:
            message(generated, key=str(idx))
        else:
            # Empty model output: nudge the user instead of showing a blank bubble.
            message("Please reframe your question properly", key=str(idx))
93
+
94
+
95
def create_folders_if_not_exist(*folders):
    """Ensure every path in *folders* exists as a directory.

    Uses ``os.makedirs(..., exist_ok=True)`` instead of the original
    check-then-create (``os.path.exists`` guard), which could raise
    ``FileExistsError`` if the directory appeared between the check and
    the create. ``makedirs`` also creates intermediate directories
    (e.g. "data/pdfs"), matching how callers pass nested paths.
    """
    for folder in folders:
        os.makedirs(folder, exist_ok=True)
99
+
100
+
101
def main():
    """Streamlit entry point: upload a PDF, embed it, then chat over it.

    NOTE(review): the diff view this was recovered from stripped all
    indentation; the nesting below follows the standard Streamlit pattern
    (chat UI only active while a file is uploaded) — confirm against the
    original file.
    """
    st.set_page_config(
        page_title="Ask PDF",
        page_icon=":mag_right:",
        layout="wide",
    )

    st.title("Ask PDF")
    st.subheader("Unlocking Answers within Documents, Your Instant Query Companion!")

    # Sidebar for file upload
    st.sidebar.title("Upload PDF")
    uploaded_file = st.sidebar.file_uploader("", label_visibility='collapsed', type=["pdf"])

    create_folders_if_not_exist("data", "data/pdfs", "data/vectors")

    # Reset the conversation whenever a different file (or none) is selected.
    if "uploaded_file" not in st.session_state or st.session_state.uploaded_file != uploaded_file:
        st.session_state.uploaded_file = uploaded_file
        st.session_state.generated = [f"Ask me a question about {uploaded_file.name}" if uploaded_file else ""]
        st.session_state.past = ["Hey there!"]
        st.session_state.last_uploaded_file = uploaded_file.name if uploaded_file else None

    if uploaded_file is not None:
        # Persist the upload so loaders that need a real path can read it.
        filepath = "data/pdfs/" + uploaded_file.name
        with open(filepath, "wb") as temp_file:
            temp_file.write(uploaded_file.read())
        vector_file = os.path.join('data/vectors/', f'vector_store_{uploaded_file.name}.pkl')

        # Display the uploaded file name in the sidebar
        st.sidebar.markdown(f"**Uploaded file:** {uploaded_file.name}")

        # Build embeddings once per file; cache them in session state and on disk.
        if not os.path.exists(vector_file) or "ingested_data" not in st.session_state:
            with st.spinner('Embeddings are in process...'):
                ingested_data = create_vector_store(filepath)
                with open(vector_file, "wb") as f:
                    pickle.dump(ingested_data, f)
                st.session_state.ingested_data = ingested_data
            st.success('Embeddings are created successfully! ✅✅✅')
        else:
            ingested_data = st.session_state.ingested_data

        prompt = create_prompt_template()
        chain = create_retrieval_chain(ingested_data, prompt)

        user_input = st.chat_input(placeholder="Ask a question")

        if user_input:
            answer = generate_response(chain, user_input)
            st.session_state.past.append(user_input)
            st.session_state.generated.append(answer)

        # Display conversation history using Streamlit messages
        if st.session_state.generated:
            display_conversation(st.session_state)


if __name__ == "__main__":
    main()
159
+
ocr.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import easyocr
2
+ from PIL import Image
3
+ from io import BytesIO
4
+ import pypdfium2 as pdfium
5
+
6
+
7
def convert_pdf_to_images(file_path):
    """Render every page of the PDF at *file_path* to JPEG bytes.

    Returns a list of single-entry dicts, each mapping the page index to
    that page's JPEG-encoded bytes.
    """
    document = pdfium.PdfDocument(file_path)
    indices = list(range(len(document)))

    rendered_pages = document.render(
        pdfium.PdfBitmap.to_pil,
        page_indices=indices,
        # scale=scale,
    )

    pages_as_bytes = []
    for index, pil_image in zip(indices, rendered_pages):
        buffer = BytesIO()
        pil_image.save(buffer, format='jpeg', optimize=True)
        pages_as_bytes.append({index: buffer.getvalue()})

    return pages_as_bytes
26
+
27
def extract_text_with_easyocr(list_dict_final_images):
    """OCR the page images from convert_pdf_to_images and join their text.

    Each element of *list_dict_final_images* is a one-entry dict mapping a
    page index to JPEG bytes; pages are OCR'd in order and joined with
    newlines into a single string.
    """
    reader = easyocr.Reader(['en'])
    page_bytes = [next(iter(page.values())) for page in list_dict_final_images]

    pages_text = []
    for raw_bytes in page_bytes:
        # NOTE(review): readtext is handed a PIL Image here; easyocr also
        # accepts raw bytes / ndarrays — confirm PIL input works with the
        # pinned easyocr version.
        pil_image = Image.open(BytesIO(raw_bytes))
        detections = reader.readtext(pil_image)
        pages_text.append("\n".join(hit[1] for hit in detections))

    return "\n".join(pages_text)
requirements.txt ADDED
Binary file (6.37 kB). View file