Kazeemkz committed on
Commit
fe4fdc2
1 Parent(s): cb6b522

Add Streamlit application and dependencies

Browse files
Files changed (2) hide show
  1. app.py +66 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app that answers questions about an uploaded PDF.

Pipeline: extract the PDF text, split it into overlapping chunks, embed the
chunks with a SentenceTransformer, retrieve the chunks nearest to the user's
question from a FAISS index, and run an extractive question-answering model
over each retrieved chunk.
"""

import io

import faiss
import numpy as np
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Lightweight extractive question-answering model from the Hugging Face hub.
model_name = "distilbert-base-uncased-distilled-squad"
# SentenceTransformer model used to embed both the chunks and the question.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
# Maximum number of chunks retrieved as candidate contexts per question.
TOP_K = 5


@st.cache_resource
def _load_qa_components(name):
    """Load the tokenizer, QA model and QA pipeline for the model called *name*.

    Cached because Streamlit re-executes the whole script on every widget
    interaction; without caching the models would be reloaded each time.
    """
    tok = AutoTokenizer.from_pretrained(name)
    model = AutoModelForQuestionAnswering.from_pretrained(name)
    qa = pipeline("question-answering", model=model, tokenizer=tok)
    return tok, model, qa


@st.cache_resource
def _load_embedding_model(name):
    """Load the SentenceTransformer called *name* (cached across reruns)."""
    return SentenceTransformer(name)


tokenizer, qa_model, qa_pipeline = _load_qa_components(model_name)
embedding_model = _load_embedding_model(EMBEDDING_MODEL_NAME)


@st.cache_resource(max_entries=4)
def build_index(pdf_bytes: bytes):
    """Split the PDF given as raw bytes into chunks and index their embeddings.

    Returns a ``(chunks, index)`` pair. ``index`` is ``None`` (and ``chunks``
    is empty) when no text could be extracted, e.g. for a scanned PDF.
    The result is cached per file content so that asking another question
    about the same upload does not re-embed the whole document.
    """
    reader = PdfReader(io.BytesIO(pdf_bytes))
    # A page without extractable text may yield None/""; treat it as empty.
    # Pages are joined with a newline so that the last line of one page does
    # not fuse with the first line of the next.
    text = "\n".join(page.extract_text() or "" for page in reader.pages)

    splitter = RecursiveCharacterTextSplitter(
        separators=["\n"],
        chunk_size=800,
        chunk_overlap=150,
        length_function=len,
    )
    chunks = splitter.split_text(text)
    if not chunks:
        return [], None

    # FAISS operates on float32 matrices.
    embeddings = np.asarray(embedding_model.encode(chunks), dtype="float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return chunks, index


def answer_question(question: str, chunks, index) -> str:
    """Answer *question* from the chunks most similar to it.

    Retrieves up to ``TOP_K`` nearest chunks, runs the QA pipeline on each,
    and returns the distinct non-empty answers joined by spaces, ordered by
    the score the QA pipeline reports for each (highest first).
    """
    query = np.asarray(embedding_model.encode([question]), dtype="float32")
    # Never ask for more neighbours than there are vectors: FAISS pads the
    # result with -1, and chunks[-1] would silently select the last chunk.
    k = min(TOP_K, len(chunks))
    _, neighbours = index.search(query, k=k)
    contexts = [chunks[i] for i in neighbours[0] if i >= 0]

    results = [qa_pipeline(question=question, context=ctx) for ctx in contexts]
    results.sort(key=lambda res: res["score"], reverse=True)

    seen = set()
    answers = []
    for res in results:
        candidate = res["answer"].strip()
        if candidate and candidate not in seen:
            seen.add(candidate)
            answers.append(candidate)
    return " ".join(answers)


st.header("Question and Answer Chatbot")

# Upload PDF files
with st.sidebar:
    st.title("Turn your PDFs into a Q&A session. Upload a file and start asking questions")
    file = st.file_uploader("PDF file upload", type="pdf")

if file is not None:
    chunks, index = build_index(file.getvalue())

    if index is None:
        st.warning("No text could be extracted from this PDF.")
    else:
        # Get user question
        user_question = st.text_input("Type your question here")

        if user_question:
            st.write(answer_question(user_question, chunks, index))
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ PyPDF2
3
+ langchain # Ensure this package name is correct based on your actual usage
4
+ transformers
5
+ sentence-transformers
6
+ faiss-cpu  # PyPI distribution that provides the `faiss` module imported by app.py
7
+ numpy