tahirsher committed on
Commit ed3b297 • 1 Parent(s): 3cf9170

Update app.py

Files changed (1): app.py (+29, -6)
app.py CHANGED
@@ -1,21 +1,28 @@
 import os
+import requests
+import streamlit as st
+from io import BytesIO
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
-import streamlit as st
-import requests
-from io import BytesIO
+from transformers import AutoModel, AutoTokenizer
+import torch
 
 # Set up Groq API key
 GROQ_API_KEY = os.getenv("GROQ_Api_Key")
 
+# Initialize embedding model (using sentence-transformers model)
+tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+embedding_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+
 # List of GitHub PDF URLs
 PDF_URLS = [
-    "https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi%20pat%20graphs.pdf",
     "https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi-partite.pdf",
+    "https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi%20pat%20graphs.pdf",
     # Add more document links as needed
 ]
 
+# Fetch and extract text from PDF files hosted on GitHub
 def fetch_pdf_text_from_github(urls):
     text = ""
     for url in urls:
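Note: both entries in PDF_URLS are GitHub "blob" URLs, which serve the HTML viewer page rather than the PDF bytes, so a plain requests.get() on them will hand PdfReader a response it cannot parse. A minimal sketch of a fix, assuming the fetch loop downloads each URL directly (the helper name to_raw_url is illustrative, not part of the commit):

# Sketch: rewrite a GitHub blob URL to its raw-content form so that
# requests.get() receives the actual PDF bytes. Illustrative helper,
# not part of this commit.
def to_raw_url(blob_url: str) -> str:
    return (blob_url
            .replace("github.com", "raw.githubusercontent.com")
            .replace("/blob/", "/"))

# to_raw_url(PDF_URLS[0])
# -> "https://raw.githubusercontent.com/TahirSher/GenAI_Lawyers_Guide/main/bi-partite.pdf"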
@@ -34,16 +41,30 @@ def fetch_pdf_text_from_github(urls):
         st.error(f"Failed to fetch PDF from URL: {url}")
     return text
 
+# Split text into manageable chunks
 @st.cache_data
 def get_text_chunks(text):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
     chunks = text_splitter.split_text(text)
     return chunks
 
+# Compute embeddings for text chunks
+def compute_embeddings(text_chunks):
+    embeddings = []
+    for text in text_chunks:
+        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+        with torch.no_grad():
+            model_output = embedding_model(**inputs)
+        embeddings.append(model_output.last_hidden_state.mean(dim=1).squeeze().numpy())
+    return embeddings
+
+# Create a FAISS vector store with embeddings
 @st.cache_resource
 def load_or_create_vector_store(text_chunks):
-    embeddings = FAISS.get_default_embeddings()
-    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
+    # Compute embeddings for text chunks
+    embeddings = compute_embeddings(text_chunks)
+    # Create FAISS vector store
+    vector_store = FAISS.from_texts(text_chunks, embeddings)
     return vector_store
 
 # Call Groq API for generating summary based on the query and retrieved text
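A caveat on the new vector-store path: in langchain_community, FAISS.from_texts expects its second argument to be an Embeddings object exposing embed_documents and embed_query, not a list of precomputed vectors, so passing the output of compute_embeddings directly is likely to break at index build or at query time. A minimal sketch of a wrapper, assuming the commit's compute_embeddings stays as-is (the class name MeanPooledMiniLM is illustrative):

from langchain_core.embeddings import Embeddings

# Sketch, not part of the commit: adapt the manual mean-pooling above to the
# Embeddings interface that FAISS.from_texts and similarity_search expect.
class MeanPooledMiniLM(Embeddings):  # illustrative name
    def embed_documents(self, texts):
        # Reuse the commit's compute_embeddings() for document vectors
        return [vec.tolist() for vec in compute_embeddings(texts)]

    def embed_query(self, text):
        # Route queries through the same model so distances are comparable
        return self.embed_documents([text])[0]

# Usage: vector_store = FAISS.from_texts(text_chunks, MeanPooledMiniLM())

Alternatively, HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") from langchain_community.embeddings wraps the same model behind this interface out of the box.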
@@ -66,11 +87,13 @@ def generate_summary_with_groq(query, retrieved_text):
         st.error("Failed to generate summary with Groq API")
         return "Error in Groq API response"
 
+# Generate response for user query
 def user_input(user_question, vector_store):
     docs = vector_store.similarity_search(user_question)
     context_text = " ".join([doc.page_content for doc in docs])
     return generate_summary_with_groq(user_question, context_text)
 
+# Main function to run the Streamlit app
 def main():
     st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="📄")
     st.title("📄 Query PDF Documents on GitHub")
 