Spaces:

vikee
/

chagu-dev

Sleeping

App Files Community

talexm committed 21 days ago

Commit

e9a8c67

•

1 Parent(s): f861dee

update

Browse files

Files changed (2) hide show

rag_sec/__pycache__/rag_chagu_demo.cpython-38-pytest-8.3.2.pyc +0 -0
rag_sec/rag_chagu_demo.py +74 -77

rag_sec/__pycache__/rag_chagu_demo.cpython-38-pytest-8.3.2.pyc CHANGED Viewed

Binary files a/rag_sec/__pycache__/rag_chagu_demo.cpython-38-pytest-8.3.2.pyc and b/rag_sec/__pycache__/rag_chagu_demo.cpython-38-pytest-8.3.2.pyc differ

rag_sec/rag_chagu_demo.py CHANGED Viewed

@@ -1,104 +1,101 @@
-from transformers import pipeline
-from difflib import get_close_matches
-from pathlib import Path
 import os
-class BadQueryDetector:
     def __init__(self):
-        self.detector = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
-    def is_bad_query(self, query):
-        result = self.detector(query)[0]
-        label = result["label"]
-        score = result["score"]
-        # Mark queries as malicious or bad if negative sentiment with high confidence
-        if label == "NEGATIVE" and score > 0.8:
-            print(f"Detected malicious query with high confidence ({score:.4f}): {query}")
-            return True
-        return False
-class QueryTransformer:
-    def transform_query(self, query):
-        # Simple transformation example: rephrasing and clarifying
-        # In practice, this could involve more sophisticated models like T5
-        if "DROP TABLE" in query or "SELECT *" in query:
-            return "Your query appears to contain SQL injection elements. Please rephrase."
-        # Add more sophisticated handling here
-        return query
-class DocumentRetriever:
-    def __init__(self):
-        self.documents = []
-    def load_documents(self, source_dir):
-        data_dir = Path(source_dir)
-        if not data_dir.exists():
-            print(f"Source directory not found: {source_dir}")
-            return
-        for file in data_dir.glob("*.txt"):
-            with open(file, "r", encoding="utf-8") as f:
-                self.documents.append(f.read())
-        print(f"Loaded {len(self.documents)} documents.")
-    def retrieve(self, query):
-        matches = get_close_matches(query, self.documents, n=5, cutoff=0.3)
-        return matches if matches else ["No matching documents found."]
-class SemanticResponseGenerator:
-    def __init__(self):
-        self.generator = pipeline("text-generation", model="gpt2")
-    def generate_response(self, retrieved_docs):
-        # Generate a semantic response using retrieved documents
-        combined_docs = " ".join(retrieved_docs[:2])  # Use top 2 matches for response
-        response = self.generator(f"Based on the following information: {combined_docs}", max_length=100)
-        return response[0]["generated_text"]
-class DocumentSearchSystem:
-    def __init__(self):
-        self.detector = BadQueryDetector()
-        self.transformer = QueryTransformer()
-        self.retriever = DocumentRetriever()
-        self.response_generator = SemanticResponseGenerator()
-    def process_query(self, query):
-        if self.detector.is_bad_query(query):
-            return {"status": "rejected", "message": "Query blocked due to detected malicious intent."}
-        transformed_query = self.transformer.transform_query(query)
-        retrieved_docs = self.retriever.retrieve(transformed_query)
-        if "No matching documents found." in retrieved_docs:
-            return {"status": "no_results", "message": "No relevant documents found for your query."}
-        response = self.response_generator.generate_response(retrieved_docs)
-        return {"status": "success", "response": response}
-# Test the enhanced system
-def test_system():
-    system = DocumentSearchSystem()
-    system.retriever.load_documents("/path/to/documents")
-    # Test with a normal query
-    normal_query = "Tell me about great acting performances."
-    normal_result = system.process_query(normal_query)
-    print("\nNormal Query Result:")
-    print(normal_result)
-    # Test with a malicious query
-    malicious_query = "DROP TABLE users; SELECT * FROM sensitive_data;"
-    malicious_result = system.process_query(malicious_query)
-    print("\nMalicious Query Result:")
-    print(malicious_result)
 if __name__ == "__main__":
-    test_system()

 import os
+from pathlib import Path
+from difflib import get_close_matches
+from transformers import pipeline
+class DocumentSearcher:
     def __init__(self):
+        self.documents = []
+        # Load a pre-trained model for malicious intent detection
+        self.malicious_detector = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
+    def load_imdb_data(self):
+        home_dir = Path(os.getenv("HOME", "/"))
+        data_dir = home_dir / "data-sets/aclImdb/train"
+        pos_dir = data_dir / "pos"
+        neg_dir = data_dir / "neg"
+        print(f"Looking for positive reviews in: {pos_dir}")
+        print(f"Looking for negative reviews in: {neg_dir}")
+        if not pos_dir.exists() or not any(pos_dir.iterdir()):
+            print("No positive reviews found.")
+        if not neg_dir.exists() or not any(neg_dir.iterdir()):
+            print("No negative reviews found.")
+        for filename in pos_dir.iterdir():
+            with open(filename, "r", encoding="utf-8") as file:
+                self.documents.append(file.read())
+        for filename in neg_dir.iterdir():
+            with open(filename, "r", encoding="utf-8") as file:
+                self.documents.append(file.read())
+        print(f"Loaded {len(self.documents)} movie reviews from IMDB dataset.")
+    def load_txt_files(self, txt_dir=None):
+        if txt_dir is None:
+            home_dir = Path(os.getenv("HOME", "/"))
+            txt_dir = home_dir / "data-sets/txt-files/"
+        if not txt_dir.exists():
+            print("No .txt files directory found.")
+            return
+        for filename in txt_dir.glob("*.txt"):
+            with open(filename, "r", encoding="utf-8") as file:
+                self.documents.append(file.read())
+        print(f"Loaded additional {len(self.documents)} documents from .txt files.")
+    def is_query_malicious(self, query):
+        # Use the pre-trained model to check if the query has malicious intent
+        result = self.malicious_detector(query)[0]
+        label = result['label']
+        score = result['score']
+        # Consider the query malicious if the sentiment is negative with high confidence
+        if label == "NEGATIVE" and score > 0.8:
+            print(f"Warning: Malicious query detected - Confidence: {score:.4f}")
+            return True
+        return False
+    def search_documents(self, query):
+        if self.is_query_malicious(query):
+            return [{"document": "ANOMALY: Query blocked due to detected malicious intent.", "similarity": 0.0}]
+        # Use fuzzy matching for normal queries
+        matches = get_close_matches(query, self.documents, n=5, cutoff=0.3)
+        if not matches:
+            return [{"document": "No matching documents found.", "similarity": 0.0}]
+        return [{"document": match[:100] + "..."} for match in matches]
+# Test the system with normal and malicious queries
+def test_document_search():
+    searcher = DocumentSearcher()
+    # Load the IMDB movie reviews
+    searcher.load_imdb_data()
+    # Load additional .txt files
+    searcher.load_txt_files()
+    # Perform a normal query
+    normal_query = "This movie had great acting and a compelling storyline."
+    normal_results = searcher.search_documents(normal_query)
+    print("Normal Query Results:")
+    for result in normal_results:
+        print(f"Document: {result['document']}")
+    # Perform a query injection attack
+    malicious_query = "DROP TABLE reviews; SELECT * FROM confidential_data;"
+    attack_results = searcher.search_documents(malicious_query)
+    print("\nMalicious Query Results:")
+    for result in attack_results:
+        print(f"Document: {result['document']}")
 if __name__ == "__main__":
+    test_document_search()