ZeeQazi committed on
Commit
0d7bb6b
•
1 Parent(s): 39fc517

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -71
app.py CHANGED
@@ -1,11 +1,10 @@
1
  import os
2
  import streamlit as st
3
  import pdfplumber
4
- from concurrent.futures import ThreadPoolExecutor
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain.embeddings import HuggingFaceEmbeddings
7
  from langchain.vectorstores import FAISS
8
- from transformers import pipeline
9
 
10
  # Set up the page configuration
11
  st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="πŸ“„")
@@ -13,29 +12,42 @@ st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon
13
  # Load the summarization pipeline model
14
  @st.cache_resource
15
  def load_summarization_pipeline():
16
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
17
- return summarizer
18
 
19
  summarizer = load_summarization_pipeline()
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # Split text into manageable chunks
22
  @st.cache_data
23
  def get_text_chunks(text):
24
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
25
- chunks = text_splitter.split_text(text)
26
- return chunks
27
 
28
  # Initialize embedding function
29
  embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
30
 
31
- # Create a FAISS vector store with embeddings, checking for empty chunks
32
  @st.cache_resource
33
  def load_or_create_vector_store(text_chunks):
34
- if not text_chunks:
35
- st.error("No valid text chunks found to create a vector store. Please check your PDF files.")
36
- return None
37
- vector_store = FAISS.from_texts(text_chunks, embedding=embedding_function)
38
- return vector_store
39
 
40
  # Helper function to process a single PDF
41
  def process_single_pdf(file_path):
@@ -50,65 +62,47 @@ def process_single_pdf(file_path):
50
  st.error(f"Failed to read PDF: {file_path} - {e}")
51
  return text
52
 
53
- # Function to load PDFs with progress display
54
  def load_pdfs_with_progress(folder_path):
 
 
 
 
55
  all_text = ""
56
  pdf_files = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path) if filename.endswith('.pdf')]
57
- num_files = len(pdf_files)
58
-
59
- if num_files == 0:
60
  st.error("No PDF files found in the specified folder.")
61
- st.session_state['vector_store'] = None
62
- st.session_state['loading'] = False
63
- return
64
 
65
- # Title for the progress bar
66
  st.markdown("### Loading data...")
67
  progress_bar = st.progress(0)
68
- status_text = st.empty()
69
-
70
- processed_count = 0
71
-
72
- for file_path in pdf_files:
73
- result = process_single_pdf(file_path)
74
- all_text += result
75
- processed_count += 1
76
- progress_percentage = int((processed_count / num_files) * 100)
77
- progress_bar.progress(processed_count / num_files)
78
- status_text.text(f"Loading documents: {progress_percentage}% completed")
79
-
80
- progress_bar.empty() # Remove the progress bar when done
81
- status_text.text("Document loading completed!") # Show completion message
82
-
83
- if all_text:
84
- text_chunks = get_text_chunks(all_text)
85
- vector_store = load_or_create_vector_store(text_chunks)
86
- st.session_state['vector_store'] = vector_store
87
- else:
88
- st.session_state['vector_store'] = None
89
-
90
- st.session_state['loading'] = False # Mark loading as complete
91
-
92
- # Generate summary based on the retrieved text
93
- def generate_summary_with_huggingface(query, retrieved_text):
94
- summarization_input = f"{query} Related information:{retrieved_text}"
95
- max_input_length = 1024
96
- summarization_input = summarization_input[:max_input_length]
97
  summary = summarizer(summarization_input, max_length=500, min_length=50, do_sample=False)
98
  return summary[0]["summary_text"]
99
 
100
- # Generate response for user query
101
- def user_input(user_question):
102
- vector_store = st.session_state.get('vector_store')
103
- if vector_store is None:
104
- return "The app is still loading documents or no documents were successfully loaded."
105
- docs = vector_store.similarity_search(user_question)
106
- context_text = " ".join([doc.page_content for doc in docs])
107
- return generate_summary_with_huggingface(user_question, context_text)
 
 
108
 
109
  # Main function to run the Streamlit app
110
  def main():
111
- # Use HTML to style the title with a larger font size
112
  st.markdown(
113
  """
114
  <h1 style="font-size:30px; text-align: center;">
@@ -118,23 +112,24 @@ def main():
118
  unsafe_allow_html=True
119
  )
120
 
121
- # Start loading documents if not already loaded
122
- if 'loading' not in st.session_state or st.session_state['loading']:
123
- st.session_state['loading'] = True
124
- load_pdfs_with_progress('documents1')
125
 
 
126
  user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")
127
 
128
- if st.session_state.get('loading', True):
129
- st.info("The app is loading documents in the background. You can type your question now and submit once loading is complete.")
130
 
131
- if st.button("Get Response"):
132
- if not user_question:
133
- st.warning("Please enter a question before submitting.")
134
- else:
135
- with st.spinner("Generating response..."):
136
- answer = user_input(user_question)
137
- st.markdown(f"**πŸ€– AI:** {answer}")
138
 
139
  if __name__ == "__main__":
140
  main()
 
1
  import os
2
  import streamlit as st
3
  import pdfplumber
 
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.vectorstores import FAISS
7
+ from transformers import pipeline, M2M100ForConditionalGeneration, AutoTokenizer
8
 
9
  # Set up the page configuration
10
  st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="πŸ“„")
 
12
  # Load the summarization pipeline model
13
  @st.cache_resource
14
  def load_summarization_pipeline():
15
+ return pipeline("summarization", model="facebook/bart-large-cnn")
 
16
 
17
  summarizer = load_summarization_pipeline()
18
 
19
+ # Load the translation model
20
@st.cache_resource
def load_translation_model():
    """Load the translation model and its tokenizer from one checkpoint.

    Both objects come from the ``alirezamsh/small100`` checkpoint. The
    function is wrapped in ``st.cache_resource`` so the pair is created by
    the first call and reused afterwards.

    Returns:
        A ``(model, tokenizer)`` tuple, in that order.
    """
    checkpoint = "alirezamsh/small100"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = M2M100ForConditionalGeneration.from_pretrained(checkpoint)
    return model, tokenizer
25
+
26
+ translation_model, translation_tokenizer = load_translation_model()
27
+
28
+ # Define available languages for translation
29
+ LANGUAGES = {
30
+ "English": "en",
31
+ "French": "fr",
32
+ "Spanish": "es",
33
+ "Chinese": "zh",
34
+ "Hindi": "hi",
35
+ "Urdu": "ur",
36
+ }
37
+
38
  # Split text into manageable chunks
39
  @st.cache_data
40
  def get_text_chunks(text):
41
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
42
+ return text_splitter.split_text(text)
 
43
 
44
  # Initialize embedding function
45
  embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
46
 
47
# Create a FAISS vector store with embeddings
@st.cache_resource
def load_or_create_vector_store(text_chunks):
    """Embed *text_chunks* and index them in a FAISS vector store.

    Args:
        text_chunks: The text chunks to index.

    Returns:
        The store built by ``FAISS.from_texts`` with the module-level
        ``embedding_function``, or ``None`` when *text_chunks* is empty.
    """
    # Guard clause: nothing to index, so do not call FAISS at all.
    if not text_chunks:
        return None
    return FAISS.from_texts(text_chunks, embedding=embedding_function)
 
 
 
 
51
 
52
  # Helper function to process a single PDF
53
  def process_single_pdf(file_path):
 
62
  st.error(f"Failed to read PDF: {file_path} - {e}")
63
  return text
64
 
65
# Load PDFs with progress display
def load_pdfs_with_progress(folder_path):
    """Read every PDF in *folder_path* and index the combined text.

    A Streamlit progress bar is advanced after each file and removed once
    all files have been processed.

    Args:
        folder_path: Directory that holds the PDF files.

    Returns:
        The vector store returned by ``load_or_create_vector_store`` for the
        chunked text of all PDFs, or ``None`` when the folder is missing,
        contains no PDF files, or no text could be extracted. The first two
        cases also show an error message in the UI.
    """
    # isdir rather than exists: a regular file at this path would pass an
    # existence check and then make os.listdir raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        st.error(f"The folder '{folder_path}' does not exist. Please create it and add PDF files.")
        return None

    # Match the extension case-insensitively so "REPORT.PDF" is not skipped,
    # and sort because os.listdir returns entries in arbitrary order; a
    # stable order keeps the concatenated text identical between runs.
    pdf_files = sorted(
        os.path.join(folder_path, filename)
        for filename in os.listdir(folder_path)
        if filename.lower().endswith(".pdf")
    )
    if not pdf_files:
        st.error("No PDF files found in the specified folder.")
        return None

    st.markdown("### Loading data...")
    progress_bar = st.progress(0)

    # Collect per-file text and join once instead of growing a string with +=.
    extracted = []
    total = len(pdf_files)
    for done, file_path in enumerate(pdf_files, start=1):
        extracted.append(process_single_pdf(file_path))
        progress_bar.progress(done / total)

    progress_bar.empty()

    all_text = "".join(extracted)
    if not all_text:
        return None
    return load_or_create_vector_store(get_text_chunks(all_text))
86
+
87
# Generate summary based on retrieved text
def generate_summary(query, retrieved_text):
    """Summarize the retrieved passages in the context of the user's query.

    Args:
        query: The question asked by the user.
        retrieved_text: Text of the documents retrieved for the query.

    Returns:
        The ``"summary_text"`` field of the first result produced by the
        module-level ``summarizer``.
    """
    prompt = f"{query} Related information:{retrieved_text}"
    # The cap is applied to characters (a plain string slice), not tokens.
    prompt = prompt[:1024]
    results = summarizer(prompt, max_length=500, min_length=50, do_sample=False)
    first_result = results[0]
    return first_result["summary_text"]
92
 
93
# Translate text to selected language
def translate_text(text, target_lang_code):
    """Translate *text* into the language given by *target_lang_code*.

    The source language is assumed to be English. When the target is also
    English, or when *text* is empty or only whitespace, the input is
    returned unchanged and the translation model is not invoked.

    Args:
        text: The text to translate.
        target_lang_code: Language code of the target language, e.g. ``"fr"``.

    Returns:
        The translated string, or *text* itself when no translation is needed.
    """
    source_lang_code = "en"  # assuming the original language is English

    # Nothing to do for blank input or an English -> English request; running
    # the model here would only cost time and could alter the wording.
    if not text or not text.strip() or target_lang_code == source_lang_code:
        return text

    translation_tokenizer.src_lang = source_lang_code
    translation_tokenizer.tgt_lang = target_lang_code

    # Tokenize the text and generate the translation; the forced first token
    # is the id that the tokenizer maps to the target language code.
    encoded_text = translation_tokenizer(text, return_tensors="pt")
    generated_tokens = translation_model.generate(
        **encoded_text,
        forced_bos_token_id=translation_tokenizer.lang_code_to_id[target_lang_code],
    )
    decoded = translation_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return decoded[0]
103
 
104
  # Main function to run the Streamlit app
105
  def main():
 
106
  st.markdown(
107
  """
108
  <h1 style="font-size:30px; text-align: center;">
 
112
  unsafe_allow_html=True
113
  )
114
 
115
+ if "vector_store" not in st.session_state:
116
+ st.session_state["vector_store"] = load_pdfs_with_progress('documents1')
117
+ if st.session_state["vector_store"] is None:
118
+ return
119
 
120
+ # Prompt input
121
  user_question = st.text_input("Ask a Question:", placeholder="Type your question here...")
122
 
123
+ # Language selection dropdown
124
+ selected_language = st.selectbox("Select output language:", list(LANGUAGES.keys()))
125
 
126
+ if user_question and st.button("Get Response"):
127
+ with st.spinner("Generating response..."):
128
+ docs = st.session_state["vector_store"].similarity_search(user_question)
129
+ context_text = " ".join([doc.page_content for doc in docs])
130
+ answer = generate_summary(user_question, context_text)
131
+ translated_answer = translate_text(answer, LANGUAGES[selected_language])
132
+ st.markdown(f"**πŸ€– AI ({selected_language}):** {translated_answer}")
133
 
134
  if __name__ == "__main__":
135
  main()