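"""Streamlit chatbot that answers questions about uploaded PDF files.

Each PDF is split into paragraphs, embedded into its own Chroma collection,
and wrapped in a LangChain RetrievalQA chain backed by a local HuggingFace
causal LM. Incoming questions are routed to the chain whose source document
matches the detected query language.
"""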
import streamlit as st
import torch
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import os
import re
import pickle
import tempfile
import fitz  # PyMuPDF
from langchain.schema import Document
import langdetect
def clean_output(output: str) -> str:
    """Return only the answer text that follows the [/INST] tag in the raw model output."""
    print("Raw output:", output)  # Debugging line
    start_index = output.find('[/INST]') + len('[/INST]')
    cleaned_output = output[start_index:].strip()
    print("Cleaned output:", cleaned_output)  # Debugging line
    return cleaned_output
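# Example (illustrative): clean_output("[INST] hi [/INST] Paris.") == "Paris."
# Note: find() returns -1 when the tag is absent, so the slice would silently
# drop the first six characters; the tag is assumed to always be present.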
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
def split_text_into_paragraphs(text_content):
    # '#' is assumed to mark paragraph boundaries in the extracted text.
    paragraphs = text_content.split('#')
    return [paragraph.strip() for paragraph in paragraphs if paragraph.strip()]
def sanitize_filename(filename):
    # Chroma restricts collection names (limited character set, 63-char cap),
    # so normalize the filename before using it as a collection name.
    sanitized_name = re.sub(r'[^a-zA-Z0-9_-]', '_', filename)
    return sanitized_name[:63]
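# Example (illustrative): sanitize_filename("my report (v2).pdf") == "my_report__v2__pdf"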
def extract_text_from_pdf(pdf_path):
    text_content = ''
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            text_content += page.get_text()
    return text_content
def detect_language(text):
    try:
        return langdetect.detect(text)
    except langdetect.LangDetectException:
        return "en"  # Default to English if detection fails
def process_pdf_file(filename, pdf_path, embeddings, llm, prompt):
    print(f'\nProcessing: {pdf_path}')
    text_content = extract_text_from_pdf(pdf_path)
    language = detect_language(text_content)
    print(f"Detected language: {language}")
    paragraphs = split_text_into_paragraphs(text_content)
    documents = [
        Document(page_content=paragraph, metadata={"language": language, "source": filename})
        for paragraph in paragraphs
    ]
    print(f"Number of documents created: {len(documents)}")
    collection_name = sanitize_filename(os.path.basename(filename))
    db = Chroma.from_documents(documents, embeddings, collection_name=collection_name)
    retriever = db.as_retriever(search_kwargs={"k": 2})
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )
    print(f"QA chain created for {filename}")
    return qa_chain, language
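# Usage (illustrative): qa_chain({"query": "..."}) returns a dict with "result"
# (the generated answer) and "source_documents" (the k=2 retrieved paragraphs).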
SYSTEM_PROMPT = """
Use the provided context to answer the question clearly and concisely. Do not repeat the context in your answer.
"""
def generate_prompt(prompt: str, system_prompt: str = SYSTEM_PROMPT) -> str:
    # Llama-2-style chat format: system prompt wrapped in <<SYS>> tags inside [INST].
    return f"""
[INST] <<SYS>>
{system_prompt}
<</SYS>>
{prompt} [/INST]
""".strip()
def main():
    # Streamlit UI
    st.title("PDF-Powered Chatbot")

    # File Uploader
    uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)

    # Model Loading: simple on-disk cache so reruns skip re-downloading the weights.
    model_pickle_path = '/kaggle/working/model.pkl'
    if os.path.exists(model_pickle_path):
        with open(model_pickle_path, 'rb') as f:
            model, tokenizer = pickle.load(f)
    else:
        MODEL_NAME = "sarvamai/sarvam-2b-v0.5"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
        with open(model_pickle_path, 'wb') as f:
            pickle.dump((model, tokenizer), f)

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

    text_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=1024,
        temperature=0.1,
        top_p=0.95,
        repetition_penalty=1.15,
        device=DEVICE,
    )
    llm = HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0})

    template = generate_prompt(
        """
{context}
Question: {question}
""",
        system_prompt=SYSTEM_PROMPT,
    )
    prompt_template = PromptTemplate(template=template, input_variables=["context", "question"])
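    # The "stuff" chain fills {context} with the retrieved paragraphs and
    # {question} with the user query before calling the LLM.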
    # One QA chain per detected document language.
    qa_chains = {}

    # Process uploaded files
    if uploaded_files:
        with st.spinner("Processing PDFs..."):
            for uploaded_file in uploaded_files:
                # Streamlit uploads live in memory, so write the bytes to a
                # temporary file that PyMuPDF can open by path.
                file_path = os.path.join(tempfile.gettempdir(), uploaded_file.name)
                with open(file_path, "wb") as f:
                    f.write(uploaded_file.getbuffer())
                qa_chain, doc_language = process_pdf_file(uploaded_file.name, file_path, embeddings, llm, prompt_template)
                qa_chains[doc_language] = (qa_chain, uploaded_file.name)
        st.success("PDFs processed! You can now ask questions.")

    # Chat interface
    if st.button("Clear Chat History"):
        st.session_state.chat_history = []

    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    for message in st.session_state.chat_history:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if user_query := st.chat_input("Ask your question here"):
        st.session_state.chat_history.append({"role": "user", "content": user_query})
        with st.chat_message("user"):
            st.markdown(user_query)
        with st.spinner("Generating response..."):
            # Route the query to the chain whose document matches its language.
            query_language = detect_language(user_query)
            if query_language in qa_chains:
                qa_chain, _ = qa_chains[query_language]
                result = qa_chain({"query": user_query})
                cleaned_answer = clean_output(result['result'])
                with st.chat_message("assistant"):
                    st.markdown(cleaned_answer)
                st.session_state.chat_history.append({"role": "assistant", "content": cleaned_answer})
            else:
                no_doc_message = f"No document available for the detected language: {query_language}"
                with st.chat_message("assistant"):
                    st.markdown(no_doc_message)
                st.session_state.chat_history.append({"role": "assistant", "content": no_doc_message})


if __name__ == "__main__":
    main()