Spaces: Red-tech-hub (Sleeping)

Red-tech-hub committed on
Commit • 19353ca
1 Parent(s): 442880a
[update] new vectores
- .env +3 -1
- chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/data_level0.bin +2 -2
- chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/header.bin +1 -1
- chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/index_metadata.pickle +1 -1
- chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/length.bin +1 -1
- chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/link_lists.bin +1 -1
- chroma_db/chroma.sqlite3 +2 -2
- finetune.py +7 -7
- finetunePinecone.py +112 -0
- requirements.txt +3 -2
- run.py +5 -5
.env CHANGED
@@ -1,2 +1,4 @@
 TRANSFORMERS_CACHE=/code/model/cache
-HF_HUB_DISABLE_SYMLINKS_WARNING=true
+HF_HUB_DISABLE_SYMLINKS_WARNING=true
+PINECONE_API_KEY="04e7b9a8-4d29-4c1a-a4bd-f61d84cbbc58"
+HF_HOME=/code/model/cache
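Note: these variables are picked up at startup via python-dotenv, which the scripts below already call. A minimal sketch of how they are consumed (only the variable names come from this .env; everything else is illustrative):

import os
from dotenv import load_dotenv

load_dotenv()  # reads the .env shown above from the working directory

pinecone_api_key = os.environ.get("PINECONE_API_KEY")  # used by the new finetunePinecone.py
hf_cache = os.getenv("HF_HOME", "/code/model/cache")   # cache directory for Hugging Face downloads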
chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/data_level0.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:c43c0fbe34b585fd92affd208ed4762d98c01ed73533d075ce449ef6c622c872
+size 1676000
chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/header.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:35e84f099c65ade720d9a85b056a1619549b039e5ec79157f87d43bb6918187f
 size 100
chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/index_metadata.pickle RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:cd6a6304deeab5d34b507b65b3b8d295efe81f2a272f8ad148f7d78cc578f679
 size 55974
chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/length.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:dee53a6241ae7881c9ab2e4e091f69b64ef544e12c427f9278da1e9b5b9c93c5
 size 4000
chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/link_lists.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:21cb3111a44e08a70ca4290809114061e0a51f7f7a0a22afc52280db70942449
 size 8624
chroma_db/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:a1c050a90eba5d3d16210b23d6b8578fc53b8b77eab8b19ddd3f4c81910dae16
+size 21774336
finetune.py CHANGED
@@ -8,15 +8,15 @@ from transformers import AutoModelForCausalLM
 
 load_dotenv()
 
-ollama_ef = AutoModelForCausalLM.from_pretrained("nomic-embed-text-v1.5.Q5_K_S.gguf",
-                                                 model_type='llama',
-                                                 max_new_tokens = 10960,
-                                                 threads = 3,
-                                                 )
+# ollama_ef = AutoModelForCausalLM.from_pretrained("nomic-embed-text-v1.5.Q5_K_S.gguf",
+#                                                  model_type='llama',
+#                                                  max_new_tokens = 10960,
+#                                                  threads = 3,
+#                                                  )
 
 csv_files = []
 root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-cve_csv_path = os.path.join(root_dir, 'data\\cve')
+cve_csv_path = os.path.join(root_dir, 'codevulnerabilityai\\data\\cve')
 
 csv_files.extend([os.path.join(cve_csv_path, f) for f in os.listdir(cve_csv_path) if f.endswith('.csv')])
 
@@ -36,7 +36,7 @@ chroma_db_directory = str("chroma_db/")
 
 client = chromadb.PersistentClient(path=os.path.join(chroma_data_path, chroma_db_directory))
 
-collection = client.get_or_create_collection(name="CVE"
+collection = client.get_or_create_collection(name="CVE")
 
 documents_to_add = []
 ids_to_add = []
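Note: with the ollama_ef block commented out and no embedding_function passed, get_or_create_collection(name="CVE") falls back to Chroma's built-in default embedding function. If an explicit model were wanted instead, a minimal sketch using chromadb's bundled sentence-transformers wrapper (the model name is an assumption, not part of this commit) could look like:

from chromadb.utils import embedding_functions

# hypothetical replacement for the commented-out GGUF embedder; model name is illustrative
st_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
collection = client.get_or_create_collection(name="CVE", embedding_function=st_ef)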
finetunePinecone.py ADDED
@@ -0,0 +1,112 @@
+import uuid
+import chromadb
+import pandas as pd
+import os
+from dotenv import load_dotenv
+import json
+from transformers import AutoModelForCausalLM
+
+from pinecone.grpc import PineconeGRPC as Pinecone
+from pinecone import ServerlessSpec
+
+
+load_dotenv()
+
+ollama_ef = AutoModelForCausalLM.from_pretrained("nomic-embed-text-v1.5.Q5_K_S.gguf",
+                                                 model_type='llama',
+                                                 max_new_tokens = 10960,
+                                                 threads = 3,
+                                                 )
+
+csv_files = []
+root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+cve_csv_path = os.path.join(root_dir, 'data\\cve')
+
+csv_files.extend([os.path.join(cve_csv_path, f) for f in os.listdir(cve_csv_path) if f.endswith('.csv')])
+
+dtype_dict = {
+    'Name': str,
+    'Status': str,
+    'Description': str,
+    'References': str,
+    'Phase': str,
+    'Votes': str,
+    'Comments': str
+}
+
+
+pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
+
+chroma_data_path = str(os.getenv('CHROMA_DATA_PATH'))
+
+chroma_db_directory = str("chroma_db/")
+
+client = chromadb.PersistentClient(path=os.path.join(chroma_data_path, chroma_db_directory))
+
+collection = client.get_or_create_collection(name="CVE", embedding_function=ollama_ef)
+
+index_name = "code-vulnerability-ai"
+
+documents_to_add = []
+ids_to_add = []
+metadata_to_add = []
+documents_to_add_string = []
+
+batch_size = 10
+current_batch = 0
+
+if csv_files:
+    for csv_file in csv_files:
+        print(f"Processing {csv_file}...")
+        df = pd.read_csv(csv_file, on_bad_lines='skip', dtype=dtype_dict)
+
+        documents = df['Description'].fillna('').astype(str).tolist()
+
+        if not df.empty and 'Description' in df.columns:
+            for index, row in df.iterrows():
+                metadata_parts = row['Name'].split(';')
+                metadata = {
+                    "Name": str(metadata_parts[0].strip()),
+                    "Status": str(metadata_parts[1].strip()) if len(metadata_parts) > 1 else "",
+                    "Description": str(metadata_parts[2].strip()) if len(metadata_parts) > 2 else "",
+                    "References": str(metadata_parts[3].strip()) if len(metadata_parts) > 3 else "",
+                    "Phase": str(metadata_parts[4].strip()) if len(metadata_parts) > 4 else "",
+                    "Votes": str(metadata_parts[5].strip()) if len(metadata_parts) > 5 else "",
+                }
+                document_id = str(uuid.uuid4())
+
+                document_content = metadata["Description"]
+
+                document = {'id': document_id, 'content': document_content}
+
+                documents_to_add.append(document)
+                documents_to_add_string.append(json.dumps(documents_to_add))
+                ids_to_add.append(document_id)
+                metadata_to_add.append(metadata)
+
+                current_batch += 1
+                if current_batch % batch_size == 0:
+                    print(f"Batch {current_batch // batch_size} added to the collection.")
+                    collection.add(documents=documents_to_add_string, ids=ids_to_add, metadatas=metadata_to_add)
+                    documents_to_add = []
+                    ids_to_add = []
+                    metadata_to_add = []
+                    documents_to_add_string = []
+                    print(f"Batch {current_batch // batch_size} completed.")
+
+        else:
+            print(f"Skipping file {csv_file} due to empty DataFrame or missing 'Description' column")
+else:
+    print("No CSV files found in the directory. Skipping processing.")
+
+# Add the remaining documents if there are less than 100 left
+if documents_to_add:
+    print(f"Adding remaining {len(documents_to_add)} documents to the collection.")
+    collection.add(documents=documents_to_add_string, ids=ids_to_add, metadatas=metadata_to_add)
+
+# results = collection.query(
+#     query_texts=["Dotnet"],
+#     n_results=3,
+# )
+
+# print(results)
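Note: in the file as committed, pc, ServerlessSpec and index_name are defined but the records are only written to the Chroma collection. A hedged sketch of the Pinecone side with the current pinecone-client API (dimension, cloud and region are assumptions, not taken from this commit):

# hypothetical continuation: create the serverless index once, then upsert embedded records
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # assumed; must match the embedding model's output size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),  # assumed placement
    )

index = pc.Index(index_name)
# the vector values would come from the embedding model; the shape here is illustrative only
index.upsert(vectors=[{"id": document_id, "values": [0.0] * 768, "metadata": metadata}])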
requirements.txt CHANGED
@@ -10,7 +10,8 @@ langchain==0.1.11
 langchain_core==0.1.48
 langchain_community==0.0.36
 langserve==0.1.1
-chromadb==0.
+chromadb==0.5
 starlette==0.37.2
 typer==0.10.0
-sentence-transformers
+sentence-transformers
+pinecone-client
run.py CHANGED
@@ -11,14 +11,14 @@ os.environ['TRANSFORMERS_CACHE'] = '/code/model/cache/'
 
 model_kwargs = {'trust_remote_code': True}
 
-embedding = HuggingFaceEmbeddings(
-    model_name="nomic-ai/nomic-embed-text-v1.5",
-    model_kwargs=model_kwargs
-)
+# embedding = HuggingFaceEmbeddings(
+#     model_name="nomic-ai/nomic-embed-text-v1.5",
+#     model_kwargs=model_kwargs
+# )
 
 db = Chroma(
     persist_directory="./chroma_db",
-    embedding_function=embedding,
+    # embedding_function=embedding,
     collection_name='CVE'
 )
 
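Note: with HuggingFaceEmbeddings and the embedding_function argument commented out, this Chroma handle relies on default embeddings at query time, which may not match the vectors written by finetune.py. A minimal sketch of restoring the original wiring (names taken from the commented lines above; import paths assume the langchain_community pin in requirements.txt):

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

model_kwargs = {'trust_remote_code': True}

# re-enable the commented-out embedder so queries use the same model as indexing
embedding = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1.5",
    model_kwargs=model_kwargs,
)

db = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embedding,
    collection_name='CVE',
)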