# Build a persistent Chroma vector store over a CVE dataset and expose a
# retriever for RAG-style lookups (runs as a Hugging Face Space).
import os

import pandas as pd
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

# Pull configuration (API keys, etc.) from a local .env file into os.environ.
load_dotenv()

# CVE records to be embedded and indexed.
df = pd.read_csv("cve.csv")

# Embedding model: a small sentence-transformers checkpoint pinned to CPU so
# it runs on resource-constrained hosts such as Hugging Face Spaces.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-MiniLM-L3-v2",
    model_kwargs={"device": "cpu"},
)
# Persist the store under /tmp, which stays writable inside containerized
# deployments (e.g. Hugging Face Spaces).
db_location = "/tmp/chrome_langchain_db"

# A missing persist directory means this is a first run, so the documents
# still need to be embedded and inserted. This check must happen BEFORE the
# Chroma client is constructed, because construction creates the directory.
add_documents = not os.path.exists(db_location)

# Open (or create) the persistent Chroma collection.
vector_store = Chroma(
    embedding_function=embeddings,
    collection_name="cve_data",
    persist_directory=db_location,
)
# Populate the collection only on the first run (the persist directory did
# not exist before the store was opened above).
if add_documents:
    # Chroma rejects a single oversized insert (its client enforces a max
    # batch size), so documents are added in modest chunks.
    batch_size = 500

    documents = []
    ids = []
    for i, row in df.iterrows():
        # NOTE(review): column names are assumed — confirm they match the
        # actual header of cve.csv.
        cve_id = row.get("CVE_ID", f"CVE-{i}")
        description = row.get("Description", "")
        date = row.get("PublishedDate", "")

        # Missing CSV cells arrive from pandas as float('nan'), which
        # row.get's default does NOT replace. NaN is rejected as Chroma
        # metadata and would render as the text "nan", so normalize every
        # field to a clean string first.
        if pd.isna(cve_id):
            cve_id = f"CVE-{i}"
        if pd.isna(description):
            description = ""
        if pd.isna(date):
            date = ""
        cve_id, description, date = str(cve_id), str(description), str(date)

        content = f"CVE ID: {cve_id}\nDescription: {description}\nPublished Date: {date}"
        documents.append(
            Document(
                page_content=content,
                metadata={"published_date": date},
                id=str(i),
            )
        )
        ids.append(str(i))

    # Batched insert: stays under Chroma's per-call limit and naturally
    # skips the call entirely when the dataframe is empty.
    for start in range(0, len(documents), batch_size):
        vector_store.add_documents(
            documents=documents[start : start + batch_size],
            ids=ids[start : start + batch_size],
        )

# Retriever over the store: top-5 nearest neighbours per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 5})