Spaces:

chagu13
/

chagu-demo

Running

App Files Files Community

talexm committed on Nov 16

Commit

595bead

•

1 Parent(s): 9a25cef

adding model for sec query

Browse files

Files changed (2) hide show

.gitignore +4 -0
rag_sec/rag_chagu_demo.py +14 -12

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@


1	+ rag_sec/__pycache*
2	+
3	+ rag_sec/__pycache__/rag_chagu_demo.*
4	+

rag_sec/rag_chagu_demo.py CHANGED Viewed

@@ -1,17 +1,17 @@
 import os
 from pathlib import Path
 from difflib import get_close_matches
 class DocumentSearcher:
     def __init__(self):
         self.documents = []
-        self.malicious_patterns = ["DROP TABLE", "SELECT *", "INSERT INTO", "DELETE FROM", "--", ";"]
     def load_imdb_data(self):
-        # Define the dataset path using the HOME environment variable
-        home_dir = Path(os.getenv("HOME", "/"))  # Fallback to root if HOME is not set
         data_dir = home_dir / "data-sets/aclImdb/train"
         pos_dir = data_dir / "pos"
         neg_dir = data_dir / "neg"
@@ -23,12 +23,10 @@ class DocumentSearcher:
         if not neg_dir.exists() or not any(neg_dir.iterdir()):
             print("No negative reviews found.")
-        # Load positive reviews
         for filename in pos_dir.iterdir():
             with open(filename, "r", encoding="utf-8") as file:
                 self.documents.append(file.read())
-        # Load negative reviews
         for filename in neg_dir.iterdir():
             with open(filename, "r", encoding="utf-8") as file:
                 self.documents.append(file.read())
@@ -44,7 +42,6 @@ class DocumentSearcher:
             print("No .txt files directory found.")
             return
-        # Load all .txt files
         for filename in txt_dir.glob("*.txt"):
             with open(filename, "r", encoding="utf-8") as file:
                 self.documents.append(file.read())
@@ -52,15 +49,20 @@ class DocumentSearcher:
         print(f"Loaded additional {len(self.documents)} documents from .txt files.")
     def is_query_malicious(self, query):
-        for pattern in self.malicious_patterns:
-            if pattern.lower() in query.lower():
-                print(f"Warning: Malicious query detected - {pattern}")
-                return True
         return False
     def search_documents(self, query):
         if self.is_query_malicious(query):
-            return [{"document": "ANOMALY: Query blocked due to detected malicious content.", "similarity": 0.0}]
         # Use fuzzy matching for normal queries
         matches = get_close_matches(query, self.documents, n=5, cutoff=0.3)

 import os
 from pathlib import Path
 from difflib import get_close_matches
+from transformers import pipeline
 class DocumentSearcher:
     def __init__(self):
         self.documents = []
+        # Load a pre-trained model for malicious intent detection
+        self.malicious_detector = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
     def load_imdb_data(self):
+        home_dir = Path(os.getenv("HOME", "/"))
         data_dir = home_dir / "data-sets/aclImdb/train"
         pos_dir = data_dir / "pos"
         neg_dir = data_dir / "neg"
         if not neg_dir.exists() or not any(neg_dir.iterdir()):
             print("No negative reviews found.")
         for filename in pos_dir.iterdir():
             with open(filename, "r", encoding="utf-8") as file:
                 self.documents.append(file.read())
         for filename in neg_dir.iterdir():
             with open(filename, "r", encoding="utf-8") as file:
                 self.documents.append(file.read())
             print("No .txt files directory found.")
             return
         for filename in txt_dir.glob("*.txt"):
             with open(filename, "r", encoding="utf-8") as file:
                 self.documents.append(file.read())
         print(f"Loaded additional {len(self.documents)} documents from .txt files.")
     def is_query_malicious(self, query):
+        # Use the pre-trained model to check if the query has malicious intent
+        result = self.malicious_detector(query)[0]
+        label = result['label']
+        score = result['score']
+        # Consider the query malicious if the sentiment is negative with high confidence
+        if label == "NEGATIVE" and score > 0.8:
+            print(f"Warning: Malicious query detected - Confidence: {score:.4f}")
+            return True
         return False
     def search_documents(self, query):
         if self.is_query_malicious(query):
+            return [{"document": "ANOMALY: Query blocked due to detected malicious intent.", "similarity": 0.0}]
         # Use fuzzy matching for normal queries
         matches = get_close_matches(query, self.documents, n=5, cutoff=0.3)