ILYA_docs_RAG / preProcessPdfs.py
TheDavidYoungblood
99 additions of files in the repo, 99 additions of files...
8e70e09
raw
history blame contribute delete
No virus
3.23 kB
import os
import json
import requests
from pymilvus import MilvusClient, DataType, Schema, Collection, utility
from dotenv import load_dotenv
# load_dotenv() (python-dotenv) runs first so that values it adds to the
# process environment are visible to the os.getenv() lookups below.
load_dotenv()
# Settings are read once, at import time. os.getenv() returns None for a
# variable that is not set, so each of these may be None.
VERTOPAL_API_KEY = os.getenv("VERTOPAL_API_KEY")  # sent as the Bearer token to api.vertopal.com
ZILLIZ_CLUSTER_ENDPOINT = os.getenv("ZILLIZ_CLUSTER_ENDPOINT")  # passed to MilvusClient as `uri`
ZILLIZ_TOKEN = os.getenv("ZILLIZ_TOKEN")  # passed to MilvusClient as `token`
def convert_pdf_to_json(file_path):
    """Upload a PDF to the Vertopal convert endpoint and return its connector.

    Args:
        file_path: Path of the PDF file to upload.

    Returns:
        The value stored under ``["result"]["output"]["connector"]`` in the
        JSON body of the response.

    Raises:
        OSError: If ``file_path`` cannot be opened for reading.
        requests.RequestException: If the request fails or the API answers
            with an error status (via ``raise_for_status``).
        KeyError: If the response JSON lacks the expected keys.
    """
    url = "https://api.vertopal.com/v1/convert/file"
    headers = {
        "Authorization": f"Bearer {VERTOPAL_API_KEY}"
    }
    # NOTE(review): `data=` is form-encoded by requests, so the nested
    # "parameters" dict is probably not transmitted as a JSON object —
    # confirm the expected request shape against the Vertopal API docs.
    data = {
        "app": "[APP_ID]",
        "parameters": {
            "output": "json"
        }
    }
    # Open the upload inside a context manager so the file handle is closed
    # even when the request raises; the original left it open.
    with open(file_path, "rb") as pdf_file:
        response = requests.post(
            url, headers=headers, data=data, files={"file": pdf_file}
        )
    response.raise_for_status()
    json_data = response.json()
    return json_data["result"]["output"]["connector"]
def download_json_file(connector):
    """Ask the Vertopal download endpoint about a connector.

    Args:
        connector: Connector value obtained from a previous conversion call.

    Returns:
        The decoded JSON body of the endpoint's response.

    Raises:
        requests.RequestException: If the request fails or the API answers
            with an error status (via ``raise_for_status``).
    """
    reply = requests.post(
        "https://api.vertopal.com/v1/download/url/get",
        headers={"Authorization": f"Bearer {VERTOPAL_API_KEY}"},
        data={"app": "[APP_ID]", "connector": connector},
    )
    # Surface HTTP error statuses before attempting to decode the body.
    reply.raise_for_status()
    return reply.json()
def create_milvus_client_and_collection(collection_name):
    """Connect to the Zilliz cluster and open (or create) a collection.

    The collection is created on first use with an INT64 ``primary_key``
    field and a VARCHAR ``json_data`` field, with dynamic fields enabled.

    Args:
        collection_name: Name of the collection to open or create.

    Returns:
        A ``(client, collection)`` tuple: a ``MilvusClient`` and a pymilvus
        ``Collection`` handle for ``collection_name``.
    """
    # Function-scope import: these names are not part of the file-level
    # pymilvus import.
    # NOTE(review): the file-level import also pulls in `Schema`, which this
    # function no longer uses; confirm that name is actually exported by the
    # installed pymilvus, otherwise the module fails at import time.
    from pymilvus import CollectionSchema, FieldSchema, connections

    client = MilvusClient(uri=ZILLIZ_CLUSTER_ENDPOINT, token=ZILLIZ_TOKEN)

    # `utility.has_collection` and `Collection(...)` resolve their connection
    # through the ORM connection registry (alias "default"), so that alias
    # must be connected explicitly; creating a MilvusClient is not enough.
    connections.connect(uri=ZILLIZ_CLUSTER_ENDPOINT, token=ZILLIZ_TOKEN)

    if utility.has_collection(collection_name):
        return client, Collection(collection_name)

    schema = CollectionSchema(
        fields=[
            FieldSchema(
                name="primary_key",
                dtype=DataType.INT64,
                description="The Primary Key",
                is_primary=True,
                auto_id=False,
            ),
            FieldSchema(
                name="json_data",
                dtype=DataType.VARCHAR,
                description="JSON Data",
                max_length=65535,
            ),
        ],
        description="",
        enable_dynamic_field=True,
    )
    # NOTE(review): this schema has no vector field; Milvus may reject
    # collection creation without one — confirm against the target server.
    #
    # Passing a schema makes Collection(...) create the collection and return
    # a usable handle. The original kept the return value of
    # client.create_collection(), which is None, so the "create" path handed
    # callers no collection at all.
    collection = Collection(collection_name, schema=schema)
    return client, collection
def upload_json_to_milvus(json_data, collection_name):
    """Serialize ``json_data`` and insert it as one row into a collection.

    Args:
        json_data: Any ``json.dumps``-serializable object.
        collection_name: Name of the target collection; it is opened (or
            created) through ``create_milvus_client_and_collection``.

    Raises:
        TypeError: If ``json_data`` is not JSON-serializable.
    """
    _client, collection = create_milvus_client_and_collection(collection_name)
    # The current entity count serves as the next primary key. The original
    # used len(collection), but Collection does not implement __len__.
    # NOTE(review): num_entities may lag behind recent unflushed inserts, so
    # back-to-back uploads could reuse a key — confirm whether a flush or a
    # different key scheme is needed.
    next_key = collection.num_entities
    # Row-based insert: one dict per entity, keyed by schema field name.
    # The original list-of-tuple payload matched neither the row-based nor
    # the column-based layout.
    # NOTE(review): json_data is a VARCHAR capped at 65535 in the schema;
    # large documents may exceed it.
    row = {
        "primary_key": next_key,
        "json_data": json.dumps(json_data),
    }
    collection.insert([row])
def process_pdfs(directory):
    """Convert every ``.pdf`` file in ``directory`` and upload the result.

    Each matching file is sent through ``convert_pdf_to_json`` and
    ``download_json_file``, and the resulting JSON is stored in the
    ``pdf_json_collection`` collection. Progress is printed per file.

    Args:
        directory: Directory whose entries are scanned (non-recursively).
    """
    for entry in os.listdir(directory):
        # Only names ending in lowercase ".pdf" are processed.
        if not entry.endswith('.pdf'):
            continue
        pdf_path = os.path.join(directory, entry)
        print(f"Processing file: {pdf_path}")
        payload = download_json_file(convert_pdf_to_json(pdf_path))
        upload_json_to_milvus(payload, "pdf_json_collection")
        print(f"Uploaded JSON data for file: {pdf_path}")
def upload_persona_json(file_path):
    """Load a persona JSON file and upload it to ``persona_collection``.

    Args:
        file_path: Path of the JSON file to read.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as bytes so json.load detects UTF-8/16/32 itself instead of
    # depending on the platform's default text encoding (e.g. cp1252 on
    # Windows), which can mis-decode non-ASCII content.
    with open(file_path, "rb") as f:
        persona_json = json.load(f)
    upload_json_to_milvus(persona_json, "persona_collection")
    print("Uploaded persona JSON to Milvus")
if __name__ == "__main__":
    # Script entry point: ingest every PDF in a directory, then the persona
    # JSON file. Both locations default to the original hard-coded paths and
    # may be overridden on the command line:
    #     python preProcessPdfs.py [PDF_DIRECTORY [PERSONA_JSON_PATH]]
    import sys

    cli_args = sys.argv[1:]
    pdf_directory = (
        cli_args[0]
        if len(cli_args) > 0
        else "L:\\00.Developer Playground\\DEV\\_VS-Code\\_C3P03\\_PG\\DEV\\_HubFaceRag\\_Ilya\\ILYA\\_RAG\\_v2\\ILYA\\_docs\\_RAG\\ILYA\\pdfs"
    )
    persona_json_path = (
        cli_args[1]
        if len(cli_args) > 1
        else "L:\\00.Developer Playground\\DEV\\_VS-Code\\_C3P03\\_PG\\DEV\\_HubFaceRag\\_Ilya\\ILYA\\_RAG\\_v2\\ILYA\\_docs\\_RAG\\persona.json"
    )
    process_pdfs(pdf_directory)
    upload_persona_json(persona_json_path)