from pdfminer.high_level import extract_text

def extract_pdf_text(file_path):
    return extract_text(file_path)
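
# Usage sketch (hypothetical path; any local PDF works the same way):
# text = extract_pdf_text('./data/report.pdf')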

import zipfile
import os

# Path to the uploaded zip file
zip_file_path = './data.zip'
extract_folder = './data'

# Unzip the file if the directory does not already exist
if not os.path.exists(extract_folder):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)

pdf_folder = './data'

# List all PDF files in the directory
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]

# Extract the text of each PDF into a list (one string per document)
pdf_texts = []
for pdf_file in pdf_files:
    full_path = os.path.join(pdf_folder, pdf_file)
    pdf_texts.append(extract_pdf_text(full_path))
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = [text_splitter.split_text(text) for text in pdf_texts]
chunks = [chunk for sublist in chunks for chunk in sublist]  # Flatten list
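
# Design note: 500-character chunks with a 50-character overlap mean text
# that straddles a chunk boundary still appears intact in at least one
# chunk, at the cost of a small amount of duplicated storage.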
from sentence_transformers import SentenceTransformer
import torch
import torch.nn.functional as F

# Load the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can replace this with another model if needed

# Get embeddings for the PDF chunks produced above
embeddings = model.encode(chunks)

# Convert embeddings to a tensor and L2-normalize them, so L2 distance in
# the index below behaves like cosine distance
embeddings_tensor = torch.tensor(embeddings)
embeddings_tensor = F.normalize(embeddings_tensor, p=2, dim=-1)

# Sanity check: shape is (num_chunks, 384) for all-MiniLM-L6-v2
print(embeddings_tensor.shape)
import faiss
import numpy as np

# Build a flat (exact) L2 index over the normalized embeddings
embeddings_np = embeddings_tensor.numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)
def retrieve(query, k=5):
    # Embed the query with the same model used for the chunks,
    # and normalize it the same way as the indexed vectors
    query_embedding = model.encode([query])
    query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)
    distances, indices = index.search(query_embedding.astype('float32'), k)
    return [chunks[i] for i in indices[0]]
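
# Usage sketch (hypothetical query; assumes the index was built above):
# top_chunks = retrieve("What are the key findings?", k=3)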
from groq import Groq

# Read the API key from the environment; set GROQ_API_KEY in your shell
# rather than hard-coding secrets in source
client = Groq(api_key=os.environ["GROQ_API_KEY"])

# Define the chat completion request
def generate_response(prompt):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="llama3-8b-8192",  # Replace with your specific model if needed
    )
    return chat_completion.choices[0].message.content

# Example prompt
prompt = "Explain the importance of fast language models"
response = generate_response(prompt)

# Print the response
print(response)
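
# retrieve() above is never wired into generation. A minimal RAG sketch
# (the prompt template here is an assumption, not a fixed recipe): stuff
# the top-k retrieved chunks into the prompt as context. The Gradio app
# below could call rag_answer instead of generate_response directly.
def rag_answer(query, k=5):
    context = "\n\n".join(retrieve(query, k))
    prompt = (
        "Answer the question using only the context below.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}"
    )
    return generate_response(prompt)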
import gradio as gr

def gradio_chatbot(query):
    return generate_response(query)

gr.Interface(fn=gradio_chatbot, inputs="text", outputs="text").launch()
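
# Assumption: running locally. Pass share=True to launch() if you need a
# temporary public URL, e.g. gr.Interface(...).launch(share=True).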