Spaces: Red-tech-hub (Sleeping)

Red-tech-hub committed on
Commit • 19353ca
1 Parent(s): 442880a
[update] new vectores
- .env +3 -1
- chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/data_level0.bin +2 -2
- chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/header.bin +1 -1
- chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/index_metadata.pickle +1 -1
- chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/length.bin +1 -1
- chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/link_lists.bin +1 -1
- chroma_db/chroma.sqlite3 +2 -2
- finetune.py +7 -7
- finetunePinecone.py +112 -0
- requirements.txt +3 -2
- run.py +5 -5
.env CHANGED
@@ -1,2 +1,4 @@
 TRANSFORMERS_CACHE=/code/model/cache
-HF_HUB_DISABLE_SYMLINKS_WARNING=true
+HF_HUB_DISABLE_SYMLINKS_WARNING=true
+PINECONE_API_KEY="04e7b9a8-4d29-4c1a-a4bd-f61d84cbbc58"
+HF_HOME=/code/model/cache
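Note: these variables are picked up at startup via python-dotenv, which the scripts below already call. A minimal sketch of how they are consumed (only the variable names come from this .env; everything else is illustrative):

import os
from dotenv import load_dotenv

load_dotenv()  # reads the .env shown above from the working directory

pinecone_api_key = os.environ.get("PINECONE_API_KEY")  # used by the new finetunePinecone.py
hf_cache = os.getenv("HF_HOME", "/code/model/cache")   # cache directory for Hugging Face downloads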
chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/data_level0.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:c43c0fbe34b585fd92affd208ed4762d98c01ed73533d075ce449ef6c622c872
+size 1676000
chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/header.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:35e84f099c65ade720d9a85b056a1619549b039e5ec79157f87d43bb6918187f
 size 100
chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/index_metadata.pickle RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:cd6a6304deeab5d34b507b65b3b8d295efe81f2a272f8ad148f7d78cc578f679
 size 55974
chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/length.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:dee53a6241ae7881c9ab2e4e091f69b64ef544e12c427f9278da1e9b5b9c93c5
 size 4000
chroma_db/{2b8d0645-bd23-4864-96bb-3d8f4fa77263 → 20a0199b-8b35-420d-a98b-6310dae9461f}/link_lists.bin RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:21cb3111a44e08a70ca4290809114061e0a51f7f7a0a22afc52280db70942449
 size 8624
chroma_db/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:a1c050a90eba5d3d16210b23d6b8578fc53b8b77eab8b19ddd3f4c81910dae16
+size 21774336
finetune.py CHANGED
@@ -8,15 +8,15 @@ from transformers import AutoModelForCausalLM
 
 load_dotenv()
 
-ollama_ef = AutoModelForCausalLM.from_pretrained("nomic-embed-text-v1.5.Q5_K_S.gguf",
-                                                 model_type='llama',
-                                                 max_new_tokens = 10960,
-                                                 threads = 3,
-                                                 )
+# ollama_ef = AutoModelForCausalLM.from_pretrained("nomic-embed-text-v1.5.Q5_K_S.gguf",
+#                                                  model_type='llama',
+#                                                  max_new_tokens = 10960,
+#                                                  threads = 3,
+#                                                  )
 
 csv_files = []
 root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-cve_csv_path = os.path.join(root_dir, 'data\\cve')
+cve_csv_path = os.path.join(root_dir, 'codevulnerabilityai\\data\\cve')
 
 csv_files.extend([os.path.join(cve_csv_path, f) for f in os.listdir(cve_csv_path) if f.endswith('.csv')])
 
@@ -36,7 +36,7 @@ chroma_db_directory = str("chroma_db/")
 
 client = chromadb.PersistentClient(path=os.path.join(chroma_data_path, chroma_db_directory))
 
-collection = client.get_or_create_collection(name="CVE"
+collection = client.get_or_create_collection(name="CVE")
 
 documents_to_add = []
 ids_to_add = []
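Note: with the ollama_ef block commented out and no embedding_function passed, get_or_create_collection(name="CVE") falls back to Chroma's built-in default embedding function. If an explicit model were wanted instead, a minimal sketch using chromadb's bundled sentence-transformers wrapper (the model name is an assumption, not part of this commit) could look like:

from chromadb.utils import embedding_functions

# hypothetical replacement for the commented-out GGUF embedder; model name is illustrative
st_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
collection = client.get_or_create_collection(name="CVE", embedding_function=st_ef)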
finetunePinecone.py ADDED
@@ -0,0 +1,112 @@
+import uuid
+import chromadb
+import pandas as pd
+import os
+from dotenv import load_dotenv
+import json
+from transformers import AutoModelForCausalLM
+
+from pinecone.grpc import PineconeGRPC as Pinecone
+from pinecone import ServerlessSpec
+
+
+load_dotenv()
+
+ollama_ef = AutoModelForCausalLM.from_pretrained("nomic-embed-text-v1.5.Q5_K_S.gguf",
+                                                 model_type='llama',
+                                                 max_new_tokens = 10960,
+                                                 threads = 3,
+                                                 )
+
+csv_files = []
+root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+cve_csv_path = os.path.join(root_dir, 'data\\cve')
+
+csv_files.extend([os.path.join(cve_csv_path, f) for f in os.listdir(cve_csv_path) if f.endswith('.csv')])
+
+dtype_dict = {
+    'Name': str,
+    'Status': str,
+    'Description': str,
+    'References': str,
+    'Phase': str,
+    'Votes': str,
+    'Comments': str
+}
+
+
+pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
+
+chroma_data_path = str(os.getenv('CHROMA_DATA_PATH'))
+
+chroma_db_directory = str("chroma_db/")
+
+client = chromadb.PersistentClient(path=os.path.join(chroma_data_path, chroma_db_directory))
+
+collection = client.get_or_create_collection(name="CVE", embedding_function=ollama_ef)
+
+index_name = "code-vulnerability-ai"
+
+documents_to_add = []
+ids_to_add = []
+metadata_to_add = []
+documents_to_add_string = []
+
+batch_size = 10
+current_batch = 0
+
+if csv_files:
+    for csv_file in csv_files:
+        print(f"Processing {csv_file}...")
+        df = pd.read_csv(csv_file, on_bad_lines='skip', dtype=dtype_dict)
+
+        documents = df['Description'].fillna('').astype(str).tolist()
+
+        if not df.empty and 'Description' in df.columns:
+            for index, row in df.iterrows():
+                metadata_parts = row['Name'].split(';')
+                metadata = {
+                    "Name": str(metadata_parts[0].strip()),
+                    "Status": str(metadata_parts[1].strip()) if len(metadata_parts) > 1 else "",
+                    "Description": str(metadata_parts[2].strip()) if len(metadata_parts) > 2 else "",
+                    "References": str(metadata_parts[3].strip()) if len(metadata_parts) > 3 else "",
+                    "Phase": str(metadata_parts[4].strip()) if len(metadata_parts) > 4 else "",
+                    "Votes": str(metadata_parts[5].strip()) if len(metadata_parts) > 5 else "",
+                }
+                document_id = str(uuid.uuid4())
+
+                document_content = metadata["Description"]
+
+                document = {'id': document_id, 'content': document_content}
+
+                documents_to_add.append(document)
+                documents_to_add_string.append(json.dumps(documents_to_add))
+                ids_to_add.append(document_id)
+                metadata_to_add.append(metadata)
+
+                current_batch += 1
+                if current_batch % batch_size == 0:
+                    print(f"Batch {current_batch // batch_size} added to the collection.")
+                    collection.add(documents=documents_to_add_string, ids=ids_to_add, metadatas=metadata_to_add)
+                    documents_to_add = []
+                    ids_to_add = []
+                    metadata_to_add = []
+                    documents_to_add_string = []
+                    print(f"Batch {current_batch // batch_size} completed.")
+
+        else:
+            print(f"Skipping file {csv_file} due to empty DataFrame or missing 'Description' column")
+else:
+    print("No CSV files found in the directory. Skipping processing.")
+
+# Add the remaining documents if there are less than 100 left
+if documents_to_add:
+    print(f"Adding remaining {len(documents_to_add)} documents to the collection.")
+    collection.add(documents=documents_to_add_string, ids=ids_to_add, metadatas=metadata_to_add)
+
+# results = collection.query(
+#     query_texts=["Dotnet"],
+#     n_results=3,
+# )
+
+# print(results)
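Note: in the file as committed, pc, ServerlessSpec and index_name are defined but the records are only written to the Chroma collection. A hedged sketch of the Pinecone side with the current pinecone-client API (dimension, cloud and region are assumptions, not taken from this commit):

# hypothetical continuation: create the serverless index once, then upsert embedded records
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # assumed; must match the embedding model's output size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),  # assumed placement
    )

index = pc.Index(index_name)
# the vector values would come from the embedding model; the shape here is illustrative only
index.upsert(vectors=[{"id": document_id, "values": [0.0] * 768, "metadata": metadata}])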
requirements.txt CHANGED
@@ -10,7 +10,8 @@ langchain==0.1.11
 langchain_core==0.1.48
 langchain_community==0.0.36
 langserve==0.1.1
-chromadb==0.
+chromadb==0.5
 starlette==0.37.2
 typer==0.10.0
-sentence-transformers
+sentence-transformers
+pinecone-client
run.py CHANGED
@@ -11,14 +11,14 @@ os.environ['TRANSFORMERS_CACHE'] = '/code/model/cache/'
 
 model_kwargs = {'trust_remote_code': True}
 
-embedding = HuggingFaceEmbeddings(
-    model_name="nomic-ai/nomic-embed-text-v1.5",
-    model_kwargs=model_kwargs
-)
+# embedding = HuggingFaceEmbeddings(
+#     model_name="nomic-ai/nomic-embed-text-v1.5",
+#     model_kwargs=model_kwargs
+# )
 
 db = Chroma(
     persist_directory="./chroma_db",
-    embedding_function=embedding,
+    # embedding_function=embedding,
     collection_name='CVE'
 )
 
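Note: with HuggingFaceEmbeddings and the embedding_function argument commented out, this Chroma handle relies on default embeddings at query time, which may not match the vectors written by finetune.py. A minimal sketch of restoring the original wiring (names taken from the commented lines above; import paths assume the langchain_community pin in requirements.txt):

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

model_kwargs = {'trust_remote_code': True}

# re-enable the commented-out embedder so queries use the same model as indexing
embedding = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1.5",
    model_kwargs=model_kwargs,
)

db = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embedding,
    collection_name='CVE',
)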