from pdfminer.high_level import extract_text

def extract_pdf_text(file_path):
    # Return the full text of a PDF using pdfminer.six
    return extract_text(file_path)
import zipfile
import os

# Path to the uploaded zip file
zip_file_path = './data.zip'
extract_folder = './data'

# Unzip the archive if the data directory does not already exist
if not os.path.exists(extract_folder):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)

pdf_folder = './data'

# To read PDFs from Google Drive in Colab instead, uncomment:
# from google.colab import drive
# drive.mount('/content/drive')
# pdf_folder = '/content/drive/MyDrive'

# Extract the text of every PDF in the folder
pdf_texts = []
for pdf_file in os.listdir(pdf_folder):
    if pdf_file.endswith('.pdf'):
        full_path = os.path.join(pdf_folder, pdf_file)
        pdf_texts.append(extract_pdf_text(full_path))
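# Illustrative sanity check (not in the original code): confirm that the
# extraction step actually produced text before building the index
print(f"Extracted text from {len(pdf_texts)} PDF(s)")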
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = [text_splitter.split_text(text) for text in pdf_texts]
chunks = [chunk for sublist in chunks for chunk in sublist]  # Flatten the per-document lists
from sentence_transformers import SentenceTransformer
import torch
import torch.nn.functional as F

# Load the embedding model (swap in another sentence-transformers model if needed)
model = SentenceTransformer('all-MiniLM-L6-v2')

# `chunks` already holds the PDF text chunks produced above; do not overwrite
# it with placeholder strings, or the index will be built over dummy data.

# Embed the text chunks
embeddings = model.encode(chunks)

# Convert the embeddings to a tensor and L2-normalize them, so that Euclidean
# distance ranks neighbors the same way as cosine similarity
embeddings_tensor = torch.tensor(embeddings)
embeddings_tensor = F.normalize(embeddings_tensor, p=2, dim=-1)

print(embeddings_tensor)
import faiss
import numpy as np

# Index the normalized embeddings
embeddings_np = embeddings_tensor.numpy()
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings_np)

def retrieve(query, k=5):
    # Embed the query with the same model and normalization as the chunks
    query_embedding = model.encode([query])
    query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)
    distances, indices = index.search(query_embedding.astype(np.float32), k)
    return [chunks[i] for i in indices[0]]
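# Illustrative usage of the retriever (the query string below is a placeholder
# assumption, not part of the original code):
example_query = "What is the main topic of these documents?"
print(retrieve(example_query, k=3))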
import os
from groq import Groq

# Set up the API key (use your own key; never commit real keys to source control)
os.environ["GROQ_API_KEY"] = "YOUR_GROQ_API_KEY"

# Initialize the Groq client
client = Groq(api_key=os.environ["GROQ_API_KEY"])

# Send a single-turn chat completion request
def generate_response(prompt):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="llama3-8b-8192",  # Replace with your preferred model if needed
    )
    return chat_completion.choices[0].message.content

# Example prompt
prompt = "Explain the importance of fast language models"
response = generate_response(prompt)
print(response)
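# Note that the FAISS index and the LLM are never wired together above: the
# Gradio chatbot below calls generate_response() directly. A minimal RAG
# sketch combining retrieval and generation (rag_answer and its prompt
# template are assumptions, not part of the original code):
def rag_answer(query, k=5):
    # Stuff the top-k retrieved chunks into the prompt as context
    context = "\n\n".join(retrieve(query, k=k))
    prompt = (
        "Answer the question using only the context below.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}"
    )
    return generate_response(prompt)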
import gradio as gr

def gradio_chatbot(query):
    # To ground answers in the indexed PDFs, swap in the rag_answer sketch above
    return generate_response(query)

gr.Interface(fn=gradio_chatbot, inputs="text", outputs="text").launch()