File size: 3,227 Bytes
8e70e09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import json
import requests
from pymilvus import MilvusClient, DataType, Schema, Collection, utility
from dotenv import load_dotenv

load_dotenv()

VERTOPAL_API_KEY = os.getenv("VERTOPAL_API_KEY")
ZILLIZ_CLUSTER_ENDPOINT = os.getenv("ZILLIZ_CLUSTER_ENDPOINT")
ZILLIZ_TOKEN = os.getenv("ZILLIZ_TOKEN")

def convert_pdf_to_json(file_path):
    """Send a PDF to the Vertopal convert endpoint and return its connector.

    Args:
        file_path: Path of the PDF file to upload.

    Returns:
        The value found at ``["result"]["output"]["connector"]`` in the
        JSON body of the response.

    Raises:
        OSError: If ``file_path`` cannot be opened for reading.
        requests.HTTPError: If the response carries an error status.
        KeyError: If the response JSON lacks the expected keys.
    """
    url = "https://api.vertopal.com/v1/convert/file"
    headers = {
        "Authorization": f"Bearer {VERTOPAL_API_KEY}"
    }
    # NOTE(review): `data` is form-encoded by requests, so the nested
    # "parameters" dict may not reach the server as nested JSON -- confirm
    # the expected request format against the Vertopal API documentation.
    data = {
        "app": "[APP_ID]",
        "parameters": {
            "output": "json"
        }
    }

    # Open the PDF in a context manager so the handle is always released,
    # including when the request raises.
    with open(file_path, "rb") as pdf_file:
        response = requests.post(
            url, headers=headers, data=data, files={"file": pdf_file}
        )
    response.raise_for_status()

    json_data = response.json()
    return json_data["result"]["output"]["connector"]

def download_json_file(connector):
    """Ask the Vertopal download endpoint for the output behind ``connector``.

    Args:
        connector: Connector value returned by a previous convert request.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the response carries an error status.
    """
    endpoint = "https://api.vertopal.com/v1/download/url/get"
    auth_headers = {"Authorization": f"Bearer {VERTOPAL_API_KEY}"}
    payload = {"app": "[APP_ID]", "connector": connector}

    reply = requests.post(endpoint, headers=auth_headers, data=payload)
    # Surface HTTP errors before attempting to decode the body.
    reply.raise_for_status()
    return reply.json()

def create_milvus_client_and_collection(collection_name):
    """Connect to the Zilliz cluster and ensure ``collection_name`` exists.

    All work goes through ``MilvusClient``; the ORM helpers
    (``utility.has_collection`` / ``Collection``) need a separately
    registered connection that this script never creates.

    Args:
        collection_name: Name of the collection to look up or create.

    Returns:
        A ``(client, collection_name)`` tuple. ``MilvusClient`` addresses
        collections by name, so the name is returned as the handle to pass
        to later client calls.
    """
    client = MilvusClient(uri=ZILLIZ_CLUSTER_ENDPOINT, token=ZILLIZ_TOKEN)
    if not client.has_collection(collection_name):
        schema = MilvusClient.create_schema(
            auto_id=False, enable_dynamic_field=True, description=""
        )
        schema.add_field(field_name="primary_key", datatype=DataType.INT64, description="The Primary Key", is_primary=True, auto_id=False)
        schema.add_field(field_name="json_data", datatype=DataType.VARCHAR, description="JSON Data", max_length=65535)
        # NOTE(review): this schema has no vector field; Milvus may reject
        # it on creation -- confirm against the target cluster version.
        client.create_collection(collection_name=collection_name, schema=schema)
    return client, collection_name


def upload_json_to_milvus(json_data, collection_name):
    """Insert ``json_data`` as one row of ``collection_name``.

    The row's primary key is the collection's current row count and the
    payload is stored as a JSON string in the ``json_data`` field.

    Args:
        json_data: JSON-serialisable object to store.
        collection_name: Target collection; created if it does not exist.
    """
    client, collection = create_milvus_client_and_collection(collection_name)
    # NOTE(review): the reported row count may lag behind recent inserts,
    # which would produce duplicate keys -- confirm or switch to auto_id.
    stats = client.get_collection_stats(collection_name=collection)
    next_key = int(stats.get("row_count", 0))
    # NOTE(review): json_data is a VARCHAR capped at max_length=65535, so
    # larger serialised documents will presumably be rejected.
    row = {"primary_key": next_key, "json_data": json.dumps(json_data)}
    client.insert(collection_name=collection, data=[row])

def process_pdfs(directory, collection_name="pdf_json_collection"):
    """Convert every PDF in ``directory`` and upload the resulting JSON.

    Files are matched on a case-insensitive ``.pdf`` suffix and handled in
    sorted name order so repeated runs process them in the same sequence.

    Args:
        directory: Folder to scan (not recursive).
        collection_name: Collection that receives the converted JSON;
            defaults to ``"pdf_json_collection"``.

    Raises:
        OSError: If ``directory`` cannot be listed.
    """
    pdf_names = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(".pdf")
    )
    for name in pdf_names:
        file_path = os.path.join(directory, name)
        print(f"Processing file: {file_path}")
        connector = convert_pdf_to_json(file_path)
        json_data = download_json_file(connector)
        upload_json_to_milvus(json_data, collection_name)
        print(f"Uploaded JSON data for file: {file_path}")

def upload_persona_json(file_path):
    """Load a persona JSON file and upload it to ``persona_collection``.

    Args:
        file_path: Path of the JSON file to read.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly; the platform default (e.g. cp1252 on
    # Windows) would mis-decode non-ASCII characters in the JSON text.
    with open(file_path, "r", encoding="utf-8") as f:
        persona_json = json.load(f)
    upload_json_to_milvus(persona_json, "persona_collection")
    print("Uploaded persona JSON to Milvus")

if __name__ == "__main__":
    # Script entry point: ingest the PDF folder first, then the persona file.
    pdf_source_dir = "L:\\00.Developer Playground\\DEV\\_VS-Code\\_C3P03\\_PG\\DEV\\_HubFaceRag\\_Ilya\\ILYA\\_RAG\\_v2\\ILYA\\_docs\\_RAG\\ILYA\\pdfs"
    persona_file = "L:\\00.Developer Playground\\DEV\\_VS-Code\\_C3P03\\_PG\\DEV\\_HubFaceRag\\_Ilya\\ILYA\\_RAG\\_v2\\ILYA\\_docs\\_RAG\\persona.json"

    process_pdfs(pdf_source_dir)
    upload_persona_json(persona_file)