import os
import re
import json
from groq import Groq

def process_and_save_json(input_file_path, output_file_path, api_key,
                          chunk_size=2048, overlap_size=256,
                          model="llama3-8b-8192", about='Events',
                          details=['name', 'details']):
    """
    Processes the input file in overlapping chunks, interacts with the model, and saves the JSON output to a file.

    Parameters:
        input_file_path (str): Path to the input file containing data.
        output_file_path (str): Path to the output JSON file.
        api_key (str): Groq API key for authentication.
        chunk_size (int): Number of characters per chunk.
        overlap_size (int): Number of characters shared between consecutive chunks.
        model (str): Model identifier to use for processing.
        about (str): Short description of what the data is about, used in the system prompt.
        details (list): Column names expected in each output JSON object.
    """
    # Initialize the Groq client with the provided API key
    client = Groq(api_key=api_key)

    def read_file_in_chunks(file_path, chunk_size, overlap_size):
        """Reads the file in overlapping chunks of a specified size."""
        if overlap_size >= chunk_size:
            raise ValueError("overlap_size must be smaller than chunk_size")
        with open(file_path, 'r', encoding='utf-8') as f:
            buffer = f.read()
        start = 0
        while start < len(buffer):
            end = start + chunk_size
            yield buffer[start:end]
            if end >= len(buffer):
                break  # Final chunk reached; avoid re-yielding the tail
            start = end - overlap_size  # Start the next chunk with an overlap

    def extract_text_between_braces(text):
        """Extracts and returns all text between curly braces."""
        matches = re.findall(r'\{.*?\}', text, re.DOTALL)
        return matches

    def ensure_strings_in_json(data):
        """Ensure that all values in JSON are strings."""
        if isinstance(data, dict):
            return {k: str(v) if not isinstance(v, (dict, list)) else ensure_strings_in_json(v) for k, v in data.items()}
        elif isinstance(data, list):
            return [ensure_strings_in_json(item) for item in data]
        return str(data)

    def process_chunk(client, chunk, model, about, details):
        """Sends a chunk to the model and returns the completion."""
        
        system_message = (
            f"You are a helpful assistant for cleaning and organizing data.\n"
            f"This data is about {about}.\n"
            f"Output should be well organized as JSON with the following columns: {', '.join(details)}.\n"
            f"Do not add extra details apart from the JSON.\n"
            f"If there is no such data, return an empty list.\n"
        )
        
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": system_message
                },
                {
                    "role": "user",
                    "content": chunk
                }
            ],
            temperature=1,
            max_tokens=8192,
            top_p=1,
            stream=False,
            stop=None,
        )
        
        # Accessing the message content using dot notation
        return completion.choices[0].message.content

    combined_output = []

    # Read and process the file in chunks
    for chunk in read_file_in_chunks(input_file_path, chunk_size, overlap_size):
        output = process_chunk(client, chunk, model, about, details)
        # Extract all text between curly braces from each chunk output
        brace_texts = extract_text_between_braces(output)
        for brace_text in brace_texts:
            try:
                # Parse JSON and ensure all values are strings
                json_data = json.loads(brace_text)
                json_data = ensure_strings_in_json(json_data)
                combined_output.append(json_data)
            except json.JSONDecodeError:
                print("+++++++++++++++++++++++++++++++++++++++++++++++++")
                print("Invalid JSON format in extracted text:")
                print(brace_text)
                print("+++++++++++++++++++++++++++++++++++++++++++++++++")

    # Output the combined result to a JSON file
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(combined_output, f, indent=4)

    print(f"Processing complete. Output saved to '{output_file_path}'.")