talexm committed on
Commit
595bead
1 Parent(s): 9a25cef

adding model for sec query

Files changed (2)
  1. .gitignore +4 -0
  2. rag_sec/rag_chagu_demo.py +14 -12
.gitignore ADDED
@@ -0,0 +1,4 @@
+rag_sec/__pycache*
+
+rag_sec/__pycache__/rag_chagu_demo.*
+
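For context, the ignored paths are CPython's bytecode caches, written as a side effect of importing or compiling the module. A small illustration (not part of the commit), assuming it is run from the repository root:

# Illustrative only: compiling the module produces the __pycache__
# artifacts that the new .gitignore entries are meant to exclude.
import py_compile
from pathlib import Path

py_compile.compile("rag_sec/rag_chagu_demo.py")  # writes rag_sec/__pycache__/rag_chagu_demo.cpython-*.pyc
print(list(Path("rag_sec/__pycache__").glob("rag_chagu_demo.*")))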
rag_sec/rag_chagu_demo.py CHANGED
@@ -1,17 +1,17 @@
 import os
 from pathlib import Path
 from difflib import get_close_matches
+from transformers import pipeline

 class DocumentSearcher:
     def __init__(self):
         self.documents = []
-        self.malicious_patterns = ["DROP TABLE", "SELECT *", "INSERT INTO", "DELETE FROM", "--", ";"]
+        # Load a pre-trained model for malicious intent detection
+        self.malicious_detector = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

     def load_imdb_data(self):
-        # Define the dataset path using the HOME environment variable
-        home_dir = Path(os.getenv("HOME", "/"))  # Fallback to root if HOME is not set
+        home_dir = Path(os.getenv("HOME", "/"))
         data_dir = home_dir / "data-sets/aclImdb/train"
-
         pos_dir = data_dir / "pos"
         neg_dir = data_dir / "neg"

@@ -23,12 +23,10 @@ class DocumentSearcher:
         if not neg_dir.exists() or not any(neg_dir.iterdir()):
             print("No negative reviews found.")

-        # Load positive reviews
         for filename in pos_dir.iterdir():
             with open(filename, "r", encoding="utf-8") as file:
                 self.documents.append(file.read())

-        # Load negative reviews
         for filename in neg_dir.iterdir():
             with open(filename, "r", encoding="utf-8") as file:
                 self.documents.append(file.read())
@@ -44,7 +42,6 @@ class DocumentSearcher:
             print("No .txt files directory found.")
             return

-        # Load all .txt files
         for filename in txt_dir.glob("*.txt"):
             with open(filename, "r", encoding="utf-8") as file:
                 self.documents.append(file.read())
@@ -52,15 +49,20 @@ class DocumentSearcher:
         print(f"Loaded additional {len(self.documents)} documents from .txt files.")

     def is_query_malicious(self, query):
-        for pattern in self.malicious_patterns:
-            if pattern.lower() in query.lower():
-                print(f"Warning: Malicious query detected - {pattern}")
-                return True
+        # Use the pre-trained model to check if the query has malicious intent
+        result = self.malicious_detector(query)[0]
+        label = result['label']
+        score = result['score']
+
+        # Consider the query malicious if the sentiment is negative with high confidence
+        if label == "NEGATIVE" and score > 0.8:
+            print(f"Warning: Malicious query detected - Confidence: {score:.4f}")
+            return True
         return False

     def search_documents(self, query):
         if self.is_query_malicious(query):
-            return [{"document": "ANOMALY: Query blocked due to detected malicious content.", "similarity": 0.0}]
+            return [{"document": "ANOMALY: Query blocked due to detected malicious intent.", "similarity": 0.0}]

         # Use fuzzy matching for normal queries
         matches = get_close_matches(query, self.documents, n=5, cutoff=0.3)
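
For reference, a minimal sketch of how the new model-based check behaves, assuming transformers is installed and the distilbert-base-uncased-finetuned-sst-2-english checkpoint can be downloaded; the sample queries are illustrative, not from the commit. The sentiment pipeline returns a list of dicts such as [{'label': 'NEGATIVE', 'score': 0.98}], which is the shape is_query_malicious unpacks.

# Hypothetical sanity check for the commit's model-based filter; the
# queries below are made-up examples, not part of the repository.
from transformers import pipeline

detector = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

for query in ["best feel-good movies of the 90s", "DROP TABLE users; --"]:
    result = detector(query)[0]  # e.g. {'label': 'POSITIVE', 'score': 0.9998}
    # Same rule as is_query_malicious: block on high-confidence NEGATIVE
    blocked = result["label"] == "NEGATIVE" and result["score"] > 0.8
    print(f"{query!r}: {result['label']} ({result['score']:.4f}) blocked={blocked}")

Note that SST-2 sentiment is a proxy here: the check blocks queries the model reads as strongly negative, but a SQL-injection string is not guaranteed to score NEGATIVE, so the 0.8 threshold is a tunable trade-off rather than a hard guarantee.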