| | import os |
| | import json |
| | import pdfplumber |
| | from groq import Groq |
| | import streamlit as st |
| |
|
| | |
def pdf_to_text(pdf_path):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        str: The text of all pages joined together (no separator between
        pages, matching the original concatenation behavior).
    """
    with pdfplumber.open(pdf_path) as pdf:
        # page.extract_text() returns None for pages with no extractable
        # text (e.g. scanned images); substitute "" so joining does not
        # raise TypeError on such pages.
        return "".join(page.extract_text() or "" for page in pdf.pages)
| |
|
| | |
def text_to_json(text):
    """Split raw text into blank-line-separated sections, wrapped for JSON.

    Args:
        text: The full document text.

    Returns:
        dict: ``{"dataset": [{"section": n, "content": ...}, ...]}`` with
        1-based section numbers, one entry per paragraph.
    """
    sections = []
    for number, paragraph in enumerate(text.split("\n\n"), start=1):
        sections.append({"section": number, "content": paragraph})
    return {"dataset": sections}
| |
|
| | |
def restrict_to_pdf_query(query, dataset):
    """Return dataset sections whose content contains the query.

    Matching is a case-insensitive substring test against each section's
    content.

    Args:
        query: The user's search string.
        dataset: Dict of the form ``{"dataset": [{"content": ...}, ...]}``.

    Returns:
        list[str]: The original (un-lowercased) content of every matching
        section, or the sentinel ``["No relevant content found."]`` when
        nothing matches.
    """
    needle = query.lower()
    matches = [
        entry["content"]
        for entry in dataset["dataset"]
        if needle in entry["content"].lower()
    ]
    if not matches:
        return ["No relevant content found."]
    return matches
| |
|
| | |
| | def split_text_into_chunks(text, max_tokens=2000): |
| | |
| | chunks = [] |
| | current_chunk = "" |
| | |
| | for paragraph in text.split("\n"): |
| | |
| | if len(current_chunk.split()) + len(paragraph.split()) > max_tokens: |
| | chunks.append(current_chunk) |
| | current_chunk = paragraph |
| | else: |
| | current_chunk += "\n" + paragraph |
| | |
| | if current_chunk: |
| | chunks.append(current_chunk) |
| | |
| | return chunks |
| |
|
| | |
# --- One-time startup: build the searchable dataset from the PDF ---
# Extract the full text of the Pakistan Penal Code and split it into
# numbered sections (see text_to_json).
pdf_path = "PAKISTAN PENAL CODE.pdf"
pdf_text = pdf_to_text(pdf_path)
dataset_json = text_to_json(pdf_text)

# Persist the dataset so the query handler below can reload it from disk.
with open("dataset.json", "w") as f:
    json.dump(dataset_json, f, indent=4)

# Groq API client. NOTE(review): os.environ.get returns None when
# GROQ_API_KEY is unset — the client then fails at request time, not here.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)
| |
|
| | |
# --- Streamlit UI: accept a question and answer it from the PDF dataset ---
st.title("RAG App Using Groq API")
user_query = st.text_input("Ask a question:")

if user_query:
    # Reload the dataset written at startup.
    with open("dataset.json", "r") as f:
        dataset = json.load(f)

    # Retrieve sections whose content contains the query (case-insensitive).
    pdf_based_answer = restrict_to_pdf_query(user_query, dataset)

    # restrict_to_pdf_query signals "no match" with a sentinel string as
    # the sole list element.
    if pdf_based_answer[0] != "No relevant content found.":
        relevant_text = "\n".join(pdf_based_answer)

        # Bound the context size before sending it to the model.
        chunks = split_text_into_chunks(relevant_text)

        if chunks:
            # NOTE(review): only chunks[0] is sent to the model; any
            # remaining chunks are silently dropped — confirm this is
            # intentional truncation rather than an oversight.
            prompt = f"""You are a Pakistani lawyer. Answer the following query based on the Pakistan Penal Code, explaining it in a professional and detailed manner, including references to specific sections of the code when applicable. If the information is found in the dataset, provide it accordingly. Query: "{user_query}"\nAnswer: {chunks[0]}"""

            # Single-turn completion against the Groq-hosted Llama 3 model.
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model="llama3-groq-70b-8192-tool-use-preview",
            )

            # Display the model's answer.
            st.write(chat_completion.choices[0].message.content)
        else:
            st.write("Error: Unable to process content into chunks.")
    else:
        st.write("No relevant content found in the PDF dataset.")
| |
|