pages/Project_2.2_-_Langchain_VectorDB.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ import streamlit as st
4
+ from src.functions_langchain import graph_init, initialize_inmemory_vector_store, load_and_split_documents_from_web
5
+
6
+ load_dotenv()
7
+
8
+ st.title("Langchain VectorDB")
9
+ st.write("This is a simple demonstration of the Langchain VectorDB.")
10
+
11
+ vector_store = initialize_inmemory_vector_store()
12
+ all_splits = load_and_split_documents_from_web("https://www.gutenberg.org/files/1342/1342-h/1342-h.htm")
13
+
14
+ # Index chunks
15
+ _ = vector_store.add_documents(documents=all_splits)
16
+
17
+ graph = graph_init(vector_store)
18
+
19
+ question = st.text_input("Enter a question:")
20
+ if st.button("Ask"):
21
+ st.write("Searching for an answer...")
22
+ response = graph.invoke({"question": question})
23
+ st.write(response["answer"])
pages/Project_3_-_Scrapper.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from src.functions_scrapper import scrape_website
5
+
6
+ ################################################################################
7
+ tab1, tab2 = st.tabs(["Scrapper", "DB_Extraction"])
8
+
9
+ st.sidebar.title("App parameters")
10
+
11
+ link = st.sidebar.text_input("Enter the link to the website you want to scrape")
12
+ selector = st.sidebar.selectbox("Select the tag you want to scrape", ["div", "p", "h1", "span", "a", "img"])
13
+ button = st.sidebar.button("Scrape")
14
+
15
+ ####
16
+ tab1.title("Project 3 - Scrapper")
17
+
18
+ if link and button and selector:
19
+ result = scrape_website(link, selector=selector)
20
+
21
+ tab1.write(result)
22
+
23
+
24
+
pages/Project_4_-_NLP_and_PDF_analyser.py CHANGED
@@ -1,79 +1,9 @@
1
  import streamlit as st
2
- import os
3
- from src.functions_pdf import pymupdf_pdf_to_text, pypdf2_pdf_to_text, pdfminer_pdf_to_text, pdfplumber_pdf_to_text
4
 
5
- ################################################################################
6
-
7
- # Sidebar for parameters
8
- st.sidebar.title("App Parameters")
9
-
10
- # Select method in the sidebar
11
- method = st.sidebar.selectbox(
12
- "Select the method to extract text from PDF",
13
- ("PyMuPDF", "PyPDF2", "pdfminer", "pdfplumber", "reportlab"),
14
- )
15
-
16
- # Main page title and description
17
- st.title("NLP and PDF Analyser")
18
- st.markdown(
19
- """
20
- This tool allows you to extract text from PDF files using different methods.
21
- Select a method, upload a PDF file, and extract the text.
22
- """
23
- )
24
- st.divider()
25
-
26
- # File uploader
27
- file = st.file_uploader("Upload a PDF file", type=["pdf"])
28
 
29
- if file is not None:
30
- # Display file details
31
- st.info(f"Uploaded file: **{file.name}** ({file.size / 1024:.2f} KB)")
32
-
33
- # Extract text button
34
- if st.button("Extract Text"):
35
- # Save the uploaded file to a temporary location
36
- with open("temp_uploaded_file.pdf", "wb") as temp_file:
37
- temp_file.write(file.read())
38
-
39
- # Use the temporary file path for processing
40
- temp_file_path = "temp_uploaded_file.pdf"
41
-
42
- # Extract text based on the selected method
43
- st.subheader("Extracted Text")
44
- if method == "PyMuPDF":
45
- st.write("Using **PyMuPDF** for text extraction.")
46
- text = pymupdf_pdf_to_text(temp_file_path)
47
- elif method == "PyPDF2":
48
- st.write("Using **PyPDF2** for text extraction.")
49
- text = pypdf2_pdf_to_text(temp_file_path)
50
- elif method == "pdfminer":
51
- st.write("Using **pdfminer** for text extraction.")
52
- text = pdfminer_pdf_to_text(temp_file_path)
53
- elif method == "pdfplumber":
54
- st.write("Using **pdfplumber** for text extraction.")
55
- text = pdfplumber_pdf_to_text(temp_file_path)
56
- ################################################################################
57
- # Clean up the temporary file
58
- if os.path.exists(temp_file_path):
59
- os.remove(temp_file_path)
60
- ################################################################################
61
- else:
62
- st.error("Invalid method selected.")
63
- text = ""
64
 
65
- # Display extracted text
66
- if text:
67
- st.text_area("Extracted Text", text, height=300)
68
 
69
- # Download button for extracted text
70
- st.download_button(
71
- label="Download Extracted Text",
72
- data=text,
73
- file_name="extracted_text.txt",
74
- mime="text/plain",
75
- )
76
- else:
77
- st.warning("No text extracted. Please check the PDF file or method.")
78
- else:
79
- st.warning("Please upload a PDF file to proceed.")
 
1
  import streamlit as st
 
 
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
+ ################################################################################
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
+ st.sidebar.title("App parameters")
 
 
7
 
8
+ st.write("This is the NLP and PDF analyser page. It is still under construction.")
9
+ st.write("Please come back later.")
 
 
 
 
 
 
 
 
 
pages/Project_5_-_API.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
+ ################################################################################
5
+
6
+ st.sidebar.title("App parameters")
7
+
8
+ st.write("This is the API page. It is still under construction.")
9
+ st.write(" Please come back later.")
pages/Project_6_-_RAG.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+
4
+ ################################################################################
5
+
6
+ st.sidebar.title("App parameters")
7
+
8
+ st.write("This is the RAG page. It is still under construction.")
9
+ st.write("Please come back later.")
10
+
11
+
12
+ # https://aws.amazon.com/what-is/retrieval-augmented-generation/
13
+ # https://medium.com/@dminhk/retrieval-augmented-generation-rag-explained-b1dd89979681
14
+ # https://huggingface.co/transformers/model_doc/rag.html
15
+ # https://huggingface.co/transformers/model_doc/rag-tokenizer.html
16
+
17
+ # Retrieval options: BM25, Dense Passage Retrieval, or Sentence Transformers - need to find a tool for this
18
+ # PostgreSQL or MongoDB - need to find a tool for this (should be a vector database) for future use in semantic search
19
+ # Test the APIs of Indeed, LinkedIn, and Pôle emploi
20
+ # Test the Hugging Face API
pages/Project_6_-_RAG_ED.py DELETED
@@ -1,268 +0,0 @@
1
- import streamlit as st
2
- import os
3
- from src.functions_pdf import pdfminer_pdf_to_text
4
- from src.functions_langchain import chunk_and_embed_pdf_text
5
- from src.functions_langchain import InMemoryVectorStore, graph_init, embeddings
6
- from src.functions_langchain import State, generate
7
-
8
- # https://aws.amazon.com/what-is/retrieval-augmented-generation/
9
- # https://medium.com/@dminhk/retrieval-augmented-generation-rag-explained-b1dd89979681
10
- # https://huggingface.co/transformers/model_doc/rag.html
11
- # https://huggingface.co/transformers/model_doc/rag-tokenizer.html
12
-
13
- # (BM25, Dense Passage Retrieval or Sentence Transformers). - need to find a tools for this
14
- # PostgreSQL or MongoDB - need to find a tools for this ( should be vectorial database) for the future use in semantic search
15
- # Testing API of indeed, linkedin, pole emploi
16
- # Testing API of huggingface
17
-
18
- ################################################################################
19
-
20
- # Sidebar
21
- st.sidebar.title("App Parameters")
22
- chunk_size = st.sidebar.slider("Chunk Size", 100, 2000, 1000)
23
- chunk_overlap = st.sidebar.slider("Chunk Overlap", 0, 500, 100)
24
-
25
- # Main title
26
- st.title("RAG chat with PDF")
27
- st.divider()
28
-
29
-
30
- file = st.file_uploader("Upload a PDF file", type=["pdf"])
31
- tab1, tab2 = st.tabs(["RAG", "Debugging"])
32
-
33
-
34
- def save_uploaded_file(uploaded_file):
35
- path = "temp_uploaded_file.pdf"
36
- with open(path, "wb") as f:
37
- f.write(uploaded_file.read())
38
- return path
39
-
40
- def load_and_extract_text(pdf_path):
41
- text = pdfminer_pdf_to_text(pdf_path)
42
- if os.path.exists(pdf_path):
43
- os.remove(pdf_path)
44
- return text
45
-
46
- def init_vector_store_and_graph(pdf_text, chunk_size, chunk_overlap):
47
- chunks, _ = chunk_and_embed_pdf_text(pdf_text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
48
- vector_store = InMemoryVectorStore(embeddings)
49
- vector_store.add_texts(chunks)
50
- graph = graph_init(vector_store)
51
- return vector_store, graph, chunks
52
-
53
- # main tab
54
- with tab1:
55
- if file is not None:
56
- if "pdf_path" not in st.session_state or st.session_state["pdf_path"] != file.name:
57
- st.session_state["pdf_path"] = file.name
58
- st.session_state["temp_pdf_path"] = save_uploaded_file(file)
59
- st.session_state["pdf_text"] = None
60
- st.session_state["vector_store"] = None
61
- st.session_state["graph"] = None
62
- st.session_state["chunks"] = None
63
- st.session_state["state"] = None
64
-
65
- if st.button("Launch app"):
66
- with st.spinner("Extracting and processing PDF..."):
67
- text = load_and_extract_text(st.session_state["temp_pdf_path"])
68
- if not text:
69
- st.warning("No text extracted from PDF.")
70
- else:
71
- st.session_state["pdf_text"] = text
72
- vector_store, graph, chunks = init_vector_store_and_graph(text, chunk_size, chunk_overlap)
73
- st.session_state["vector_store"] = vector_store
74
- st.session_state["graph"] = graph
75
- st.session_state["chunks"] = chunks
76
- st.success(f"Processed PDF with {len(chunks)} chunks.")
77
-
78
- if "graph" in st.session_state and st.session_state["graph"] is not None:
79
- query = st.text_input("Ask a question about the PDF:", key="query_tab1")
80
- if query:
81
- state = State(question=query, context=[], answer="")
82
- st.session_state["state"] = state
83
- with st.spinner("Retrieving context and generating answer..."):
84
- result_state = st.session_state["graph"].invoke(state)
85
- st.session_state["state"] = result_state
86
-
87
- if result_state.get("context"):
88
- st.success(f"Retrieved {len(result_state['context'])} relevant documents.")
89
- st.markdown("### Answer:")
90
- st.write(result_state.get("answer", "No answer generated."))
91
- else:
92
- st.warning("No relevant context found for the question.")
93
-
94
- # Debugging tab
95
- with tab2:
96
- if file is not None:
97
- st.info(f"Uploaded file: **{file.name}** ({file.size / 1024:.2f} KB)")
98
- if st.button("Extract Text"):
99
- temp_pdf_path = save_uploaded_file(file)
100
- text = load_and_extract_text(temp_pdf_path)
101
- if text:
102
- st.success("Text extracted successfully!")
103
- st.session_state["pdf_text"] = text
104
- st.text_area("Extracted Text", text, height=300)
105
- st.download_button("Download Extracted Text", text, "extracted_text.txt", "text/plain")
106
- else:
107
- st.warning("No text extracted. Please check the PDF.")
108
-
109
- if "pdf_text" in st.session_state and st.session_state["pdf_text"]:
110
- if st.button("Process and Embed Text"):
111
- with st.spinner("Chunking and embedding text..."):
112
- vector_store, graph, chunks = init_vector_store_and_graph(st.session_state["pdf_text"], chunk_size, chunk_overlap)
113
- st.session_state["vector_store"] = vector_store
114
- st.session_state["graph"] = graph
115
- st.session_state["chunks"] = chunks
116
- st.success(f"Processed {len(chunks)} chunks and created embeddings.")
117
- for i, chunk in enumerate(chunks[:3]):
118
- st.markdown(f"**Chunk {i+1}:**")
119
- st.write(chunk)
120
-
121
- if "graph" in st.session_state and st.session_state["graph"] is not None:
122
- query_debug = st.text_input("Ask a question about the PDF:", key="query_tab2")
123
- if query_debug:
124
- state = State(question=query_debug, context=[], answer="")
125
- st.session_state["state"] = state
126
- with st.spinner("Retrieving context and generating answer..."):
127
- result_state = st.session_state["graph"].invoke(state)
128
- st.session_state["state"] = result_state
129
- if result_state.get("context"):
130
- st.success(f"Retrieved {len(result_state['context'])} documents.")
131
- st.markdown("### Answer:")
132
- st.write(result_state.get("answer", "No answer generated."))
133
- else:
134
- st.warning("No relevant context found for the question.")
135
-
136
-
137
- # with tab1:
138
- # # Upload PDF
139
-
140
- # if file is not None:
141
- # temp_file_path = "temp_uploaded_file.pdf"
142
- # with open(temp_file_path, "wb") as temp_file:
143
- # temp_file.write(file.read())
144
-
145
- # if st.button("Launch app"):
146
- # with st.spinner("Preloading information..."):
147
- # text = pdfminer_pdf_to_text(temp_file_path)
148
- # st.session_state["pdf_text"] = text
149
-
150
- # vector_store = InMemoryVectorStore(embeddings)
151
- # chunks, vectors = chunk_and_embed_pdf_text(st.session_state["pdf_text"], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
152
-
153
- # vector_store = InMemoryVectorStore(embeddings)
154
- # vector_store.add_texts(chunks)
155
-
156
- # st.session_state["vector_store"] = vector_store
157
- # st.session_state["graph"] = graph_init(vector_store)
158
-
159
- # st.success("App is ready to use!")
160
-
161
- # if "graph" in st.session_state:
162
- # query = st.text_input("Ask a question about the PDF:")
163
- # if query:
164
- # state = State(question=query, context=[], answer="")
165
- # st.session_state["state"] = state
166
-
167
- # with st.spinner("Retrieving context..."):
168
- # context = st.session_state["graph"].invoke(state)
169
- # st.session_state["state"]["context"] = context["context"]
170
-
171
- # if st.session_state["state"]["context"]:
172
- # st.success(f"Retrieved {len(st.session_state['state']['context'])} documents.")
173
-
174
- # with st.spinner("Generating answer..."):
175
- # answer = generate(st.session_state["state"])
176
- # st.session_state["state"]["answer"] = answer["answer"]
177
-
178
- # st.markdown("### Answer:")
179
- # st.write(st.session_state["state"]["answer"])
180
- # else:
181
- # st.warning("No relevant context found for the question.")
182
-
183
-
184
-
185
- # with tab2:
186
- # ### FIRST ETAPE ----UPLOAD THE PDF-FILE AND RETURN THE TEXT RESULT ----
187
-
188
- # if file is not None:
189
- # st.info(f"Uploaded file: **{file.name}** ({file.size / 1024:.2f} KB)")
190
-
191
- # if st.button("Extract Text"):
192
- # temp_file_path = "temp_uploaded_file.pdf"
193
-
194
- # with open(temp_file_path, "wb") as temp_file:
195
- # temp_file.write(file.read())
196
-
197
- # text = pdfminer_pdf_to_text(temp_file_path)
198
-
199
- # if os.path.exists(temp_file_path):
200
- # os.remove(temp_file_path)
201
-
202
- # if text:
203
- # st.success("Text extracted successfully!")
204
- # st.session_state["pdf_text"] = text
205
-
206
- # if st.checkbox("Show extracted text"):
207
- # st.text_area("Extracted Text", text, height=300)
208
-
209
- # st.download_button(
210
- # label="Download Extracted Text",
211
- # data=text,
212
- # file_name="extracted_text.txt",
213
- # mime="text/plain"
214
- # )
215
- # else:
216
- # st.warning("No text extracted. Please check the PDF.")
217
- # else:
218
- # st.warning("Please upload a PDF file to proceed.")
219
-
220
-
221
- # # SECOND ETAPE ---- New button and logic for chunking & embedding ( no mongo db, only session state ) ----
222
-
223
-
224
- # vector_store = InMemoryVectorStore(embeddings)
225
-
226
-
227
- # if "pdf_text" in st.session_state:
228
- # if st.button("Process and Embed Text"):
229
- # with st.spinner("Chunking and embedding text..."):
230
- # chunks, vectors = chunk_and_embed_pdf_text(st.session_state["pdf_text"], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
231
-
232
- # # Initialize vector store and add texts
233
- # vector_store = InMemoryVectorStore(embeddings)
234
- # vector_store.add_texts(chunks)
235
-
236
- # # Save vector store and graph in session state
237
- # st.session_state["vector_store"] = vector_store
238
- # st.session_state["graph"] = graph_init(vector_store)
239
-
240
- # st.success(f"Processed {len(chunks)} chunks and created embeddings.")
241
- # for i, chunk in enumerate(chunks[:3]):
242
- # st.markdown(f"**Chunk {i+1}:**")
243
- # st.write(chunk)
244
-
245
-
246
- # # THIRD ETAPE ---- Add a question and answer logic ----
247
-
248
- # if "graph" in st.session_state:
249
- # query = st.text_input("Ask a question about the PDF:")
250
- # if query:
251
- # state = State(question=query, context=[], answer="")
252
- # st.session_state["state"] = state
253
-
254
- # with st.spinner("Retrieving context..."):
255
- # context = st.session_state["graph"].invoke(state)
256
- # st.session_state["state"]["context"] = context["context"]
257
-
258
- # if st.session_state["state"]["context"]:
259
- # st.success(f"Retrieved {len(st.session_state['state']['context'])} documents.")
260
-
261
- # with st.spinner("Generating answer..."):
262
- # answer = generate(st.session_state["state"])
263
- # st.session_state["state"]["answer"] = answer["answer"]
264
-
265
- # st.markdown("### Answer:")
266
- # st.write(st.session_state["state"]["answer"])
267
- # else:
268
- # st.warning("No relevant context found for the question.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/archive/Project_2.2_-_Langchain_VectorDB.py DELETED
@@ -1,23 +0,0 @@
1
- # import os
2
- # from dotenv import load_dotenv
3
- # import streamlit as st
4
- # from src.functions_langchain import graph_init, initialize_inmemory_vector_store, load_and_split_documents_from_web
5
-
6
- # load_dotenv()
7
-
8
- # st.title("Langchain VectorDB")
9
- # st.write("This is a simple demonstration of the Langchain VectorDB.")
10
-
11
- # vector_store = initialize_inmemory_vector_store()
12
- # all_splits = load_and_split_documents_from_web("https://www.gutenberg.org/files/1342/1342-h/1342-h.htm")
13
-
14
- # # Index chunks
15
- # _ = vector_store.add_documents(documents=all_splits)
16
-
17
- # graph = graph_init(vector_store)
18
-
19
- # question = st.text_input("Enter a question:")
20
- # if st.button("Ask"):
21
- # st.write("Searching for an answer...")
22
- # response = graph.invoke({"question": question})
23
- # st.write(response["answer"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/archive/Project_3_-_Scrapper.py DELETED
@@ -1,24 +0,0 @@
1
- # import streamlit as st
2
- # import requests
3
- # from bs4 import BeautifulSoup
4
- # from src.functions_scrapper import scrape_website
5
-
6
- # ################################################################################
7
- # tab1, tab2 = st.tabs(["Scrapper", "DB_Extraction"])
8
-
9
- # st.sidebar.title("App parameters")
10
-
11
- # link = st.sidebar.text_input("Enter the link to the website you want to scrape")
12
- # selector = st.sidebar.selectbox("Select the tag you want to scrape", ["div", "p", "h1", "span", "a", "img"])
13
- # button = st.sidebar.button("Scrape")
14
-
15
- # ####
16
- # tab1.title("Project 3 - Scrapper")
17
-
18
- # if link and button and selector:
19
- # result = scrape_website(link, selector=selector)
20
-
21
- # tab1.write(result)
22
-
23
-
24
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/archive/Project_5_-_API.py DELETED
@@ -1,9 +0,0 @@
1
- # import streamlit as st
2
-
3
-
4
- # ################################################################################
5
-
6
- # st.sidebar.title("App parameters")
7
-
8
- # st.write("This is the API page. It is still under construction.")
9
- # st.write(" Please come back later.")
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -14,10 +14,4 @@ langchain-core
14
  langgraph>0.2.27
15
  sentry-sdk
16
  langchain-mongodb
17
- langchain-huggingface
18
- PyMuPDF
19
- PyPDF2
20
- pdfminer.six
21
- pdfplumber
22
- sentence-transformers
23
- langchain-text-splitters
 
14
  langgraph>0.2.27
15
  sentry-sdk
16
  langchain-mongodb
17
+ langchain-huggingface
 
 
 
 
 
 
src/__pycache__/functions_langchain.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/functions_langchain.cpython-311.pyc and b/src/__pycache__/functions_langchain.cpython-311.pyc differ
 
src/__pycache__/functions_llm.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/functions_llm.cpython-311.pyc and b/src/__pycache__/functions_llm.cpython-311.pyc differ
 
src/__pycache__/functions_nadia_llm.cpython-311.pyc DELETED
Binary file (743 Bytes)
 
src/__pycache__/functions_pdf.cpython-311.pyc DELETED
Binary file (2.7 kB)
 
src/__pycache__/functions_scrapper.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/functions_scrapper.cpython-311.pyc and b/src/__pycache__/functions_scrapper.cpython-311.pyc differ
 
src/functions_langchain.py CHANGED
@@ -20,8 +20,6 @@ from langchain_text_splitters import RecursiveCharacterTextSplitter
20
  from langgraph.graph import START, StateGraph
21
  from typing_extensions import List, TypedDict
22
  from langchain_core.vectorstores import InMemoryVectorStore
23
- from langchain.text_splitter import RecursiveCharacterTextSplitter
24
- from langchain_community.embeddings import HuggingFaceEmbeddings
25
 
26
  load_dotenv()
27
 
@@ -38,32 +36,12 @@ sentry_sdk.init(
38
  },
39
  )
40
 
41
- # client = MongoClient(mongodb_uri, server_api=ServerApi('1'))
42
 
43
  llm = init_chat_model("llama3-8b-8192", model_provider="groq")
44
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
45
  prompt = hub.pull("rlm/rag-prompt")
46
 
47
- def chunk_and_embed_pdf_text(text: str, chunk_size=1000, chunk_overlap=100):
48
- # 1. Split text into chunks
49
- text_splitter = RecursiveCharacterTextSplitter(
50
- chunk_size=chunk_size, # size of each chunk in characters
51
- chunk_overlap=chunk_overlap, # overlap to preserve context
52
- separators=["\n\n", "\n", ".", " "]
53
- )
54
- chunks = text_splitter.split_text(text)
55
-
56
- # 2. Create HuggingFace embeddings instance
57
- embeddings = HuggingFaceEmbeddings(
58
- model_name="sentence-transformers/all-mpnet-base-v2"
59
- )
60
-
61
- # 3. Embed chunks
62
- vectors = embeddings.embed_documents(chunks)
63
-
64
- # Returning both for further processing
65
- return chunks, vectors
66
-
67
  @serverless_function
68
  def initialize_inmemory_vector_store() -> InMemoryVectorStore:
69
  return InMemoryVectorStore(embeddings)
 
20
  from langgraph.graph import START, StateGraph
21
  from typing_extensions import List, TypedDict
22
  from langchain_core.vectorstores import InMemoryVectorStore
 
 
23
 
24
  load_dotenv()
25
 
 
36
  },
37
  )
38
 
39
+ client = MongoClient(mongodb_uri, server_api=ServerApi('1'))
40
 
41
  llm = init_chat_model("llama3-8b-8192", model_provider="groq")
42
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
43
  prompt = hub.pull("rlm/rag-prompt")
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  @serverless_function
46
  def initialize_inmemory_vector_store() -> InMemoryVectorStore:
47
  return InMemoryVectorStore(embeddings)
src/functions_pdf.py DELETED
@@ -1,77 +0,0 @@
1
- import pymupdf
2
- from PyPDF2 import PdfReader
3
- from pdfminer.high_level import extract_text
4
- from langchain.document_loaders import PDFPlumberLoader
5
- import streamlit as st
6
-
7
- def pymupdf_pdf_to_text(file_path):
8
- """
9
- Extract text from a PDF file using PyMuPDF.
10
-
11
- Args:
12
- file_path (str): Path to the PDF file.
13
-
14
- Returns:
15
- str: Extracted text from the PDF file.
16
- """
17
- doc = pymupdf.open(stream=file_path.read(), filetype="pdf")
18
- text = ""
19
- for page in doc:
20
- text += page.get_text() + "\n"
21
- return text
22
-
23
- def pypdf2_pdf_to_text(file_path):
24
- """
25
- Extract text from a PDF file using PyPDF2.
26
-
27
- Args:
28
- file_path (str): Path to the PDF file.
29
-
30
- Returns:
31
- str: Extracted text from the PDF file.
32
- """
33
- reader = PdfReader(file_path)
34
- text = ""
35
- for page in reader.pages:
36
- text += page.extract_text() + "\n"
37
- return text
38
-
39
- # def pdfminer_pdf_to_text(file_path):
40
- # """
41
- # Extract text from a PDF file using pdfminer.
42
-
43
- # Args:
44
- # file_path (str): Path to the PDF file.
45
-
46
- # Returns:
47
- # str: Extracted text from the PDF file.
48
- # """
49
- # # Implementation for pdfminer extraction goes here
50
- # text = extract_text(file_path)
51
- # return text
52
-
53
- def pdfminer_pdf_to_text(pdf_path: str) -> str:
54
- try:
55
- text = extract_text(pdf_path)
56
- return text.strip()
57
- except Exception as e:
58
- st.error(f"Error extracting text: {e}")
59
- return ""
60
-
61
- def pdfplumber_pdf_to_text(file_path):
62
- """
63
- Extract text from a PDF file using pdfplumber.
64
-
65
- Args:
66
- file_path (str): Path to the PDF file.
67
-
68
- Returns:
69
- str: Extracted text from the PDF file.
70
- """
71
- loader = PDFPlumberLoader(file_path)
72
- documents = loader.load()
73
- text = ""
74
- for doc in documents:
75
- text += doc.page_content + "\n"
76
- return text
77
-