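# Educational chatbot app for a Hugging Face Space: OCR an uploaded PDF or image
# (English + Gujarati) with Tesseract, split the extracted text into chunks, embed
# them into a FAISS index, and answer questions through a LangChain RetrievalQA
# chain backed by a Groq-hosted Llama 3 model, all served from a Gradio interface.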
import os
import tempfile

import gradio as gr
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq

class ChatbotModel:
    def __init__(self):
        # The Groq API key is expected in the GROQ_API_KEY environment variable
        # (e.g. set as a Space secret); do not hardcode the key in the source.
        os.environ.setdefault("GROQ_API_KEY", "<your-groq-api-key>")

        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
        self.llm = ChatGroq(
            model='llama3-70b-8192',
            temperature=0.5,
            max_tokens=None,
            timeout=None,
            max_retries=2,
        )
        self.memory = ConversationBufferMemory(memory_key="history", input_key="question")
        self.template = """You are an intelligent assistant... (Rest of your prompt as is)"""
        self.QA_CHAIN_PROMPT = PromptTemplate(
            input_variables=["history", "context", "question"],
            template=self.template
        )
        self.db1 = None
        self.qa_chain = None

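    # OCR helpers: Tesseract language packs for English ('eng') and Gujarati ('guj'),
    # plus poppler for pdf2image, are assumed to be installed on the host.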
    def ocr_image(self, image_path, language='eng+guj'):
        img = Image.open(image_path)
        return pytesseract.image_to_string(img, lang=language)

    def ocr_pdf(self, pdf_path, language='eng+guj'):
        images = convert_from_path(pdf_path)
        return "\n".join([pytesseract.image_to_string(img, lang=language) for img in images])

    def process_file(self, uploaded_file):
        # gr.File may pass a filepath string (newer Gradio) or a file-like object
        # with a .name attribute (older Gradio); resolve a local path either way.
        file_path = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name
        _, file_extension = os.path.splitext(file_path)
        file_extension = file_extension.lower()

        # Copy the upload to a temporary file with the right suffix before OCR.
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
            with open(file_path, "rb") as src:
                temp_file.write(src.read())
            temp_path = temp_file.name

        if file_extension == '.pdf':
            raw_text = self.ocr_pdf(temp_path, language='guj+eng')
        elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
            raw_text = self.ocr_image(temp_path, language='guj+eng')
        else:
            return "Unsupported file format."

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        text_chunks = text_splitter.split_text(raw_text)

        # split_text() returns plain strings, so build the index with from_texts
        # (from_documents expects Document objects).
        self.db1 = FAISS.from_texts(text_chunks, self.embeddings)

        self.qa_chain = RetrievalQA.from_chain_type(
            self.llm,
            retriever=self.db1.as_retriever(),
            chain_type='stuff',
            verbose=True,
            chain_type_kwargs={
                "verbose": True,
                "prompt": self.QA_CHAIN_PROMPT,
                "memory": self.memory
            }
        )
        return "File processed successfully!"

    def get_response(self, user_input):
        if not self.qa_chain:
            return "Please upload and process a file before asking questions."
        response = self.qa_chain({"query": user_input})
        return response["result"]

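# Single shared model instance and thin wrapper functions used as Gradio callbacks.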
chatbot = ChatbotModel()


def upload_and_process(file):
    return chatbot.process_file(file)


def ask_question(question):
    return chatbot.get_response(question)

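# Gradio UI: one row to upload and process a document, one row to ask questions.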
interface = gr.Blocks()
with interface:
    gr.Markdown("# Educational Chatbot with Document Analysis")
    with gr.Row():
        file_upload = gr.File(label="Upload PDF or Image")
        upload_btn = gr.Button("Process File")
    output = gr.Textbox(label="File Processing Status")
    with gr.Row():
        question_box = gr.Textbox(label="Ask a Question")
        ask_btn = gr.Button("Submit")
    answer = gr.Textbox(label="Answer")
    upload_btn.click(upload_and_process, inputs=file_upload, outputs=output)
    ask_btn.click(ask_question, inputs=question_box, outputs=answer)

interface.launch()