Update app.py
app.py CHANGED
@@ -1,21 +1,28 @@
 import os
+import requests
+import streamlit as st
+from io import BytesIO
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
-import requests
-import streamlit as st
-from io import BytesIO
+from transformers import AutoModel, AutoTokenizer
+import torch
 
 # Set up Groq API key
 GROQ_API_KEY = os.getenv("GROQ_Api_Key")
 
+# Initialize embedding model (using sentence-transformers model)
+tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+embedding_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+
 # List of GitHub PDF URLs
 PDF_URLS = [
-    "https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi%20pat%20graphs.pdf",
     "https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi-partite.pdf",
+    "https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi%20pat%20graphs.pdf",
     # Add more document links as needed
 ]
 
+# Fetch and extract text from PDF files hosted on GitHub
 def fetch_pdf_text_from_github(urls):
     text = ""
     for url in urls:
@@ -34,16 +41,30 @@ def fetch_pdf_text_from_github(urls):
         st.error(f"Failed to fetch PDF from URL: {url}")
     return text
 
+# Split text into manageable chunks
 @st.cache_data
 def get_text_chunks(text):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
     chunks = text_splitter.split_text(text)
     return chunks
 
+# Compute embeddings for text chunks
+def compute_embeddings(text_chunks):
+    embeddings = []
+    for text in text_chunks:
+        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+        with torch.no_grad():
+            model_output = embedding_model(**inputs)
+        embeddings.append(model_output.last_hidden_state.mean(dim=1).squeeze().numpy())
+    return embeddings
+
+# Create a FAISS vector store with embeddings
 @st.cache_resource
 def load_or_create_vector_store(text_chunks):
-    embeddings
-
+    # Compute embeddings for text chunks
+    embeddings = compute_embeddings(text_chunks)
+    # Create FAISS vector store
+    vector_store = FAISS.from_texts(text_chunks, embeddings)
     return vector_store
 
 # Call Groq API for generating summary based on the query and retrieved text
@@ -66,11 +87,13 @@ def generate_summary_with_groq(query, retrieved_text):
         st.error("Failed to generate summary with Groq API")
         return "Error in Groq API response"
 
+# Generate response for user query
 def user_input(user_question, vector_store):
     docs = vector_store.similarity_search(user_question)
     context_text = " ".join([doc.page_content for doc in docs])
     return generate_summary_with_groq(user_question, context_text)
 
+# Main function to run the Streamlit app
 def main():
     st.set_page_config(page_title="RAG-based PDF Chat", layout="centered", page_icon="📄")
     st.title("📄 Query PDF Documents on GitHub")
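A few notes on the updated code.

First, the PDF_URLS entries: a github.com link containing /blob/ serves the repository's HTML viewer page, not the PDF bytes, so PdfReader will fail on whatever requests downloads from it. A minimal sketch of one fix, assuming the files stay on the main branch; to_raw_url is a hypothetical helper, not something in the app above:

import requests
from io import BytesIO
from PyPDF2 import PdfReader

def to_raw_url(blob_url):
    # github.com/<user>/<repo>/blob/<branch>/<path> is an HTML page; the raw
    # bytes live at raw.githubusercontent.com/<user>/<repo>/<branch>/<path>.
    return blob_url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")

url = to_raw_url("https://github.com/TahirSher/GenAI_Lawyers_Guide/blob/main/bi-partite.pdf")
reader = PdfReader(BytesIO(requests.get(url, timeout=30).content))

Appending ?raw=true to each blob URL and letting requests follow the redirect would work just as well.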
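Second, compute_embeddings mean-pools last_hidden_state over every token position, padding included. Because chunks are encoded one at a time there is no padding in practice, but the moment the loop is batched the average gets skewed by pad tokens; the customary recipe for sentence-transformers checkpoints is a mask-weighted mean. A sketch, reusing the tokenizer and embedding_model defined at the top of the file:

import torch

def masked_mean_pool(last_hidden_state, attention_mask):
    # Zero out hidden states at padded positions, then divide by the number
    # of real tokens instead of the full (padded) sequence length.
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
    return (last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

inputs = tokenizer(["chunk one", "a much longer second chunk"],
                   return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    hidden = embedding_model(**inputs).last_hidden_state
vectors = masked_mean_pool(hidden, inputs["attention_mask"]).numpy()

Note also that chunk_size=10000 characters far exceeds what this checkpoint can encode: truncation=True silently cuts each chunk to the tokenizer's model maximum (on the order of a few hundred tokens for all-MiniLM-L6-v2), so most of every chunk never influences its embedding.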
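Third, the biggest catch: LangChain's FAISS.from_texts expects an Embeddings object (anything exposing embed_documents and embed_query) as its second argument, not the list of NumPy vectors that compute_embeddings returns, so load_or_create_vector_store will raise. A minimal sketch of the usual wiring, assuming the langchain_community HuggingFaceEmbeddings wrapper is an acceptable replacement for the hand-rolled pipeline:

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# The wrapper tokenizes and pools internally, which would make the manual
# AutoTokenizer/AutoModel setup and compute_embeddings redundant; the same
# object is reused later to embed queries for similarity_search.
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def load_or_create_vector_store(text_chunks):
    return FAISS.from_texts(text_chunks, hf_embeddings)

If keeping precomputed vectors is the point, FAISS.from_embeddings accepts (text, vector) pairs together with an embeddings object that is still needed for query-time encoding.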
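Last, a small one: os.getenv("GROQ_Api_Key") is case-sensitive and must match the secret name configured in the Space settings exactly. The conventional spelling is GROQ_API_KEY, and a mismatch yields None with no error until the first API call fails:

import os

# Assumes the Space secret is stored under the conventional upper-case name.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")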