# Page header captured together with the file (not part of the program):
# Spaces status: Runtime error. File size: 3,227 bytes. Ref: 8e70e09.
# (The viewer's line-number gutter, 1-89, is omitted.)
import os
import json
import requests
from pymilvus import MilvusClient, DataType, Schema, Collection, utility
from dotenv import load_dotenv
# Run python-dotenv's loader first so that variables it provides are visible
# in the process environment, then read the credentials this script uses.
# Each constant is None when its environment variable is not set.
load_dotenv()

VERTOPAL_API_KEY = os.environ.get("VERTOPAL_API_KEY")
ZILLIZ_CLUSTER_ENDPOINT = os.environ.get("ZILLIZ_CLUSTER_ENDPOINT")
ZILLIZ_TOKEN = os.environ.get("ZILLIZ_TOKEN")
def convert_pdf_to_json(file_path):
    """Send a PDF to the Vertopal convert endpoint and return its connector.

    Args:
        file_path: Path of the PDF file to upload.

    Returns:
        The value stored at ``["result"]["output"]["connector"]`` in the
        JSON body of the response.

    Raises:
        OSError: If ``file_path`` cannot be opened for reading.
        requests.HTTPError: If the API answers with an error status.
        KeyError: If the response body lacks the expected keys.
    """
    url = "https://api.vertopal.com/v1/convert/file"
    headers = {
        "Authorization": f"Bearer {VERTOPAL_API_KEY}"
    }
    # NOTE(review): `data` is sent as form fields, so the nested "parameters"
    # dict is probably not transmitted as JSON - confirm against the Vertopal
    # API what encoding this endpoint expects.
    data = {
        "app": "[APP_ID]",
        "parameters": {
            "output": "json"
        }
    }
    # Open the upload inside a context manager so the handle is closed even
    # when the request raises; previously it was never closed.
    with open(file_path, "rb") as pdf_file:
        response = requests.post(
            url, headers=headers, data=data, files={"file": pdf_file}
        )
    response.raise_for_status()
    json_data = response.json()
    return json_data["result"]["output"]["connector"]
def download_json_file(connector):
    """Ask the Vertopal download endpoint about *connector*.

    Args:
        connector: Connector value to send in the ``connector`` form field.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    auth_header = {"Authorization": f"Bearer {VERTOPAL_API_KEY}"}
    payload = {"app": "[APP_ID]", "connector": connector}
    reply = requests.post(
        "https://api.vertopal.com/v1/download/url/get",
        headers=auth_header,
        data=payload,
    )
    reply.raise_for_status()
    return reply.json()
def create_milvus_client_and_collection(collection_name):
    """Open a ``MilvusClient`` and make sure *collection_name* exists.

    Only ``MilvusClient`` methods are used. The ORM helpers (``utility``,
    ``Collection``) rely on a separately registered connection that this
    script never opens, and ``client.create_collection`` does not hand back
    a collection object, so the collection is identified by name instead.

    Args:
        collection_name: Name of the collection to look up or create.

    Returns:
        A ``(client, collection_name)`` tuple; pass the name to the client's
        methods (for example ``client.insert``) to work with the collection.
    """
    client = MilvusClient(uri=ZILLIZ_CLUSTER_ENDPOINT, token=ZILLIZ_TOKEN)
    if not client.has_collection(collection_name):
        # NOTE(review): this schema has no vector field; Milvus may reject
        # such a collection - confirm against the target cluster.
        schema = MilvusClient.create_schema(
            auto_id=False, enable_dynamic_field=True, description=""
        )
        schema.add_field(
            field_name="primary_key",
            datatype=DataType.INT64,
            description="The Primary Key",
            is_primary=True,
            auto_id=False,
        )
        schema.add_field(
            field_name="json_data",
            datatype=DataType.VARCHAR,
            description="JSON Data",
            max_length=65535,
        )
        client.create_collection(collection_name=collection_name, schema=schema)
    return client, collection_name


def upload_json_to_milvus(json_data, collection_name):
    """Insert *json_data* as a single row of the named collection.

    The payload is serialized with ``json.dumps`` and stored in the
    ``json_data`` field; the collection is created first if it is missing.

    Args:
        json_data: Any JSON-serializable object.
        collection_name: Name of the destination collection.
    """
    import time

    client, target_name = create_milvus_client_and_collection(collection_name)
    row = {
        # A nanosecond timestamp gives each row a distinct INT64 key without
        # asking the server for a row count first.
        "primary_key": time.time_ns(),
        # NOTE(review): the field is declared with max_length=65535; a larger
        # serialized document will presumably be rejected - confirm.
        "json_data": json.dumps(json_data),
    }
    client.insert(collection_name=target_name, data=[row])
def process_pdfs(directory):
    """Convert every ``.pdf`` file in *directory* and upload the results.

    File names are matched with a case-sensitive ``.pdf`` suffix. For each
    match the file is converted, the connector is passed to the download
    step, and the resulting JSON is stored in ``pdf_json_collection``.

    Args:
        directory: Folder to scan (not recursive).
    """
    # Collect all matching paths up front, before any file is processed.
    pdf_paths = []
    for entry in os.listdir(directory):
        if entry.endswith('.pdf'):
            pdf_paths.append(os.path.join(directory, entry))

    for pdf_path in pdf_paths:
        print(f"Processing file: {pdf_path}")
        json_data = download_json_file(convert_pdf_to_json(pdf_path))
        upload_json_to_milvus(json_data, "pdf_json_collection")
        print(f"Uploaded JSON data for file: {pdf_path}")
def upload_persona_json(file_path):
    """Load a persona JSON file and store it in ``persona_collection``.

    Args:
        file_path: Path of the JSON file to read.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification; naming the encoding keeps the
    # platform's locale default from mis-decoding non-ASCII characters.
    with open(file_path, "r", encoding="utf-8") as f:
        persona_json = json.load(f)
    upload_json_to_milvus(persona_json, "persona_collection")
    print("Uploaded persona JSON to Milvus")
if __name__ == "__main__":
    # Script entry point: upload every PDF in the documents folder, then the
    # persona file. Both locations are hard-coded for the author's machine.
    pdf_directory = "L:\\00.Developer Playground\\DEV\\_VS-Code\\_C3P03\\_PG\\DEV\\_HubFaceRag\\_Ilya\\ILYA\\_RAG\\_v2\\ILYA\\_docs\\_RAG\\ILYA\\pdfs"
    process_pdfs(pdf_directory)

    persona_json_path = "L:\\00.Developer Playground\\DEV\\_VS-Code\\_C3P03\\_PG\\DEV\\_HubFaceRag\\_Ilya\\ILYA\\_RAG\\_v2\\ILYA\\_docs\\_RAG\\persona.json"
    upload_persona_json(persona_json_path)