Spaces:
Running
Running
talexm
committed on
Commit
•
595bead
1
Parent(s):
9a25cef
adding model for sec query
Browse files
- .gitignore +4 -0
- rag_sec/rag_chagu_demo.py +14 -12
.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
rag_sec/__pycache*
|
2 |
+
|
3 |
+
rag_sec/__pycache__/rag_chagu_demo.*
|
4 |
+
|
rag_sec/rag_chagu_demo.py
CHANGED
@@ -1,17 +1,17 @@
|
|
1 |
import os
|
2 |
from pathlib import Path
|
3 |
from difflib import get_close_matches
|
|
|
4 |
|
5 |
class DocumentSearcher:
|
6 |
def __init__(self):
|
7 |
self.documents = []
|
8 |
-
|
|
|
9 |
|
10 |
def load_imdb_data(self):
|
11 |
-
|
12 |
-
home_dir = Path(os.getenv("HOME", "/")) # Fallback to root if HOME is not set
|
13 |
data_dir = home_dir / "data-sets/aclImdb/train"
|
14 |
-
|
15 |
pos_dir = data_dir / "pos"
|
16 |
neg_dir = data_dir / "neg"
|
17 |
|
@@ -23,12 +23,10 @@ class DocumentSearcher:
|
|
23 |
if not neg_dir.exists() or not any(neg_dir.iterdir()):
|
24 |
print("No negative reviews found.")
|
25 |
|
26 |
-
# Load positive reviews
|
27 |
for filename in pos_dir.iterdir():
|
28 |
with open(filename, "r", encoding="utf-8") as file:
|
29 |
self.documents.append(file.read())
|
30 |
|
31 |
-
# Load negative reviews
|
32 |
for filename in neg_dir.iterdir():
|
33 |
with open(filename, "r", encoding="utf-8") as file:
|
34 |
self.documents.append(file.read())
|
@@ -44,7 +42,6 @@ class DocumentSearcher:
|
|
44 |
print("No .txt files directory found.")
|
45 |
return
|
46 |
|
47 |
-
# Load all .txt files
|
48 |
for filename in txt_dir.glob("*.txt"):
|
49 |
with open(filename, "r", encoding="utf-8") as file:
|
50 |
self.documents.append(file.read())
|
@@ -52,15 +49,20 @@ class DocumentSearcher:
|
|
52 |
print(f"Loaded additional {len(self.documents)} documents from .txt files.")
|
53 |
|
54 |
def is_query_malicious(self, query):
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
59 |
return False
|
60 |
|
61 |
def search_documents(self, query):
|
62 |
if self.is_query_malicious(query):
|
63 |
-
return [{"document": "ANOMALY: Query blocked due to detected malicious
|
64 |
|
65 |
# Use fuzzy matching for normal queries
|
66 |
matches = get_close_matches(query, self.documents, n=5, cutoff=0.3)
|
|
|
1 |
import os
|
2 |
from pathlib import Path
|
3 |
from difflib import get_close_matches
|
4 |
+
from transformers import pipeline
|
5 |
|
6 |
class DocumentSearcher:
|
7 |
def __init__(self):
|
8 |
self.documents = []
|
9 |
+
# Load a pre-trained model for malicious intent detection
|
10 |
+
self.malicious_detector = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
|
11 |
|
12 |
def load_imdb_data(self):
|
13 |
+
home_dir = Path(os.getenv("HOME", "/"))
|
|
|
14 |
data_dir = home_dir / "data-sets/aclImdb/train"
|
|
|
15 |
pos_dir = data_dir / "pos"
|
16 |
neg_dir = data_dir / "neg"
|
17 |
|
|
|
23 |
if not neg_dir.exists() or not any(neg_dir.iterdir()):
|
24 |
print("No negative reviews found.")
|
25 |
|
|
|
26 |
for filename in pos_dir.iterdir():
|
27 |
with open(filename, "r", encoding="utf-8") as file:
|
28 |
self.documents.append(file.read())
|
29 |
|
|
|
30 |
for filename in neg_dir.iterdir():
|
31 |
with open(filename, "r", encoding="utf-8") as file:
|
32 |
self.documents.append(file.read())
|
|
|
42 |
print("No .txt files directory found.")
|
43 |
return
|
44 |
|
|
|
45 |
for filename in txt_dir.glob("*.txt"):
|
46 |
with open(filename, "r", encoding="utf-8") as file:
|
47 |
self.documents.append(file.read())
|
|
|
49 |
print(f"Loaded additional {len(self.documents)} documents from .txt files.")
|
50 |
|
51 |
def is_query_malicious(self, query):
|
52 |
+
# Use the pre-trained model to check if the query has malicious intent
|
53 |
+
result = self.malicious_detector(query)[0]
|
54 |
+
label = result['label']
|
55 |
+
score = result['score']
|
56 |
+
|
57 |
+
# Consider the query malicious if the sentiment is negative with high confidence
|
58 |
+
if label == "NEGATIVE" and score > 0.8:
|
59 |
+
print(f"Warning: Malicious query detected - Confidence: {score:.4f}")
|
60 |
+
return True
|
61 |
return False
|
62 |
|
63 |
def search_documents(self, query):
|
64 |
if self.is_query_malicious(query):
|
65 |
+
return [{"document": "ANOMALY: Query blocked due to detected malicious intent.", "similarity": 0.0}]
|
66 |
|
67 |
# Use fuzzy matching for normal queries
|
68 |
matches = get_close_matches(query, self.documents, n=5, cutoff=0.3)
|