Narayana02 commited on
Commit
f390c59
·
verified ·
1 Parent(s): 7eab4a2

Upload 4 files

Browse files
Files changed (4) hide show
  1. .env +1 -0
  2. app.py +59 -0
  3. requirements.txt +6 -0
  4. utilities.py +65 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY = 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
app.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from utils import (
5
+ extract_text_from_pdf,
6
+ build_hierarchical_tree,
7
+ save_tree,
8
+ hybrid_retrieval,
9
+ rag_answer,
10
+ )
11
+
12
+ # Load API key from .env
13
+ load_dotenv()
14
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
15
+
16
+ # Create necessary directories
17
+ os.makedirs("uploaded_textbooks", exist_ok=True)
18
+ os.makedirs("hierarchical_trees", exist_ok=True)
19
+ os.makedirs("retrieved_contexts", exist_ok=True)
20
+
21
+ # Streamlit UI
22
+ st.title("Hierarchical Question-Answering System 📚🤖")
23
+ st.markdown(
24
+ "Upload textbooks, explore their structure, and ask questions powered by AI."
25
+ )
26
+
27
+ # Upload PDF section
28
+ uploaded_files = st.file_uploader("Upload Textbooks (PDF)", type=["pdf"], accept_multiple_files=True)
29
+
30
+ if uploaded_files:
31
+ for uploaded_file in uploaded_files:
32
+ file_path = os.path.join("uploaded_textbooks", uploaded_file.name)
33
+ with open(file_path, "wb") as f:
34
+ f.write(uploaded_file.read())
35
+
36
+ # Extract text
37
+ st.write(f"Processing: {uploaded_file.name}")
38
+ extracted_text = extract_text_from_pdf(file_path)
39
+
40
+ # Build hierarchical tree
41
+ tree = build_hierarchical_tree(extracted_text, textbook_title=uploaded_file.name)
42
+ tree_path = os.path.join("hierarchical_trees", f"{uploaded_file.name}_tree.json")
43
+ save_tree(tree, tree_path)
44
+
45
+ st.success(f"Processed and indexed: {uploaded_file.name}")
46
+
47
+ # Query Section
48
+ query = st.text_input("Ask a question:")
49
+ if query:
50
+ st.write("Retrieving relevant information...")
51
+ relevant_text = hybrid_retrieval(query, OPENAI_API_KEY)
52
+ if relevant_text:
53
+ st.write("Generating an answer...")
54
+ answer = rag_answer(query, relevant_text, OPENAI_API_KEY)
55
+ st.write(f"**Answer:** {answer}")
56
+ st.write("**Relevant Context:**")
57
+ st.write(relevant_text)
58
+ else:
59
+ st.write("No relevant information found.")
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ PyPDF2
3
+ networkx
4
+ sentence-transformers
5
+ openai
6
+ transformers
utilities.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import json
3
+ import networkx as nx
4
+ from sentence_transformers import SentenceTransformer, util
5
+ import openai
6
+
7
+ # Model for embeddings
8
+ model = SentenceTransformer("all-MiniLM-L6-v2")
9
+
10
+ # 1. Extract Text from PDF
11
+ def extract_text_from_pdf(file_path):
12
+ """Extract text from a PDF."""
13
+ text = ""
14
+ with open(file_path, "rb") as f:
15
+ reader = PyPDF2.PdfReader(f)
16
+ for page in reader.pages:
17
+ text += page.extract_text()
18
+ return text
19
+
20
+ # 2. Build Hierarchical Tree
21
+ def build_hierarchical_tree(text, textbook_title):
22
+ """Create a hierarchical tree structure."""
23
+ lines = text.split("\n")
24
+ tree = {"title": textbook_title, "chapters": []}
25
+ current_chapter = None
26
+
27
+ for line in lines:
28
+ if line.strip().startswith("Chapter"):
29
+ current_chapter = {"title": line.strip(), "sections": []}
30
+ tree["chapters"].append(current_chapter)
31
+ elif current_chapter and line.strip():
32
+ current_chapter["sections"].append(line.strip())
33
+ return tree
34
+
35
+ def save_tree(tree, path):
36
+ """Save the hierarchical tree."""
37
+ with open(path, "w") as f:
38
+ json.dump(tree, f, indent=4)
39
+
40
+ # 3. Hybrid Retrieval
41
+ def hybrid_retrieval(query, openai_api_key):
42
+ """Retrieve relevant text using hybrid methods."""
43
+ with open("hierarchical_trees/example_tree.json") as f: # Adjust file path as needed
44
+ tree = json.load(f)
45
+
46
+ all_sections = [
47
+ section for chapter in tree["chapters"] for section in chapter["sections"]
48
+ ]
49
+ query_embedding = model.encode(query, convert_to_tensor=True)
50
+ section_embeddings = model.encode(all_sections, convert_to_tensor=True)
51
+ similarities = util.pytorch_cos_sim(query_embedding, section_embeddings)
52
+
53
+ top_indices = similarities[0].topk(3).indices.tolist()
54
+ return " ".join([all_sections[i] for i in top_indices])
55
+
56
+ # 4. RAG Answer Generation
57
+ def rag_answer(query, context, openai_api_key):
58
+ """Generate an answer using Retrieval-Augmented Generation."""
59
+ openai.api_key = openai_api_key
60
+ response = openai.Completion.create(
61
+ engine="text-davinci-003",
62
+ prompt=f"Answer the question based on the context below:\n\nContext: {context}\n\nQuestion: {query}\n\nAnswer:",
63
+ max_tokens=150,
64
+ )
65
+ return response.choices[0].text.strip()