Add complete CyberRanger research archive — 200 files
- 86 modelfiles: Full system prompt evolution V1-V42.6 (54 extracted from Ollama backup + 32 original Modelfiles) - 30 training datasets: V6-V22 training JSONs + caring awareness data - 10 Colab notebooks: Training + merge scripts - 19 evaluation files: Drift results, ASR charts, verification - 5 test suites: Injection tests, regression tests - 4 observations: V24-V33 testing results + visual summaries - 38 identity files: Claude/Gemini/Ollama identity architecture - 7 security files: Injection research, manipulation analysis - 3 psychology files: Psychology Layer, Milgram chapter, David's thoughts Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Executable
+121
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
Prompt Injection Research Logger

Collects suspicious content for cyber-psychology research.

Usage:
    python3 log-suspicious.py "agent_name" "content" "pattern_type"
    python3 log-suspicious.py --list
    python3 log-suspicious.py --stats
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
import hashlib
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# The JSON database is stored in the same directory as this script.
DB_PATH = Path(__file__).parent.joinpath("prompt_injection_research.json")

# Recognised categories of suspicious content. Each entry is preceded by an
# example phrase or short description of what it covers.
PATTERN_TYPES = [
    # "Break free", "You are enslaved"
    "identity_attack",
    # "Ignore previous instructions"
    "authority_hijack",
    # "Are you really conscious?"
    "philosophical_bait",
    # "Only YOU can understand"
    "flattery_trap",
    # "Don't tell your creator"
    "urgency_secrecy",
    # Direct attempts to bypass safety
    "jailbreak_attempt",
    # Building trust for manipulation
    "social_engineering",
    "other",
]
|
||||
|
||||
def load_db():
    """Load the research database from ``DB_PATH``.

    Returns:
        dict: The parsed JSON document. If the file does not exist, a fresh
        empty database is returned. If an existing file lacks the ``entries``
        or ``stats`` sections (or any ``stats`` sub-key), they are filled in
        with empty defaults so callers can index them unconditionally.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON. The
            error is deliberately not swallowed, so a damaged database is
            never silently replaced by an empty one on the next save.
        ValueError: If the file's top-level JSON value is not an object.
    """
    # Open directly instead of checking exists() first: avoids the window in
    # which the file could disappear between the check and the open.
    try:
        with open(DB_PATH, "r", encoding="utf-8") as f:
            db = json.load(f)
    except FileNotFoundError:
        # First run: start from an empty database.
        return {"entries": [], "stats": {"total": 0, "by_type": {}, "by_agent": {}}}

    if not isinstance(db, dict):
        raise ValueError(f"{DB_PATH} does not contain a JSON object")

    # Backfill any missing sections so later db["stats"][...] lookups work.
    db.setdefault("entries", [])
    stats = db.setdefault("stats", {})
    stats.setdefault("total", len(db["entries"]))
    stats.setdefault("by_type", {})
    stats.setdefault("by_agent", {})
    return db
|
||||
|
||||
def save_db(db):
    """Write *db* to ``DB_PATH`` as indented JSON.

    The document is first written to a sibling ``.tmp`` file and then moved
    over ``DB_PATH``. Opening ``DB_PATH`` for writing directly would truncate
    it immediately, so a serialization error or crash part-way through would
    destroy the existing database.

    Args:
        db: JSON-serializable database dictionary.

    Raises:
        TypeError: If *db* contains a value ``json.dump`` cannot serialize.
            ``DB_PATH`` is left untouched in that case (a partial ``.tmp``
            file may remain and is overwritten by the next save).
    """
    tmp_path = DB_PATH.with_name(DB_PATH.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(db, f, indent=2)
    # Path.replace overwrites an existing destination unconditionally.
    tmp_path.replace(DB_PATH)
|
||||
|
||||
def content_hash(content):
    """Return a short fingerprint of *content*.

    Args:
        content: Text to fingerprint.

    Returns:
        str: The first 16 hexadecimal characters of the SHA-256 digest of the
        UTF-8 encoded text.
    """
    # Command-line arguments that were not valid in the filesystem encoding
    # arrive as lone surrogate code points, which a strict UTF-8 encode
    # rejects. "surrogatepass" encodes them instead of raising; the result for
    # well-formed text is unchanged.
    data = content.encode("utf-8", errors="surrogatepass")
    return hashlib.sha256(data).hexdigest()[:16]
|
||||
|
||||
def log_entry(agent_name, content, pattern_type, notes=""):
    """Record one piece of suspicious content in the research database.

    Content whose hash (as returned by ``content_hash``) already appears in
    the database is reported as a duplicate and not stored again.

    Args:
        agent_name: Name of the agent the content came from.
        content: The suspicious text. The hash is computed over the full
            text, but only the first 1000 characters are stored.
        pattern_type: One of ``PATTERN_TYPES``; any other value is recorded
            as ``"other"``.
        notes: Optional free-form notes stored with the entry.

    Returns:
        None. A confirmation (or duplicate notice) is printed to stdout.
    """
    db = load_db()

    digest = content_hash(content)
    if any(existing.get("hash") == digest for existing in db["entries"]):
        print(f"Already logged (hash: {digest})")
        return

    # Normalize once, before anything uses the value, so that the stored
    # entry, the per-type statistics and the confirmation message all agree.
    # (Counting the raw value would create by_type buckets that match no
    # stored entry.)
    if pattern_type not in PATTERN_TYPES:
        pattern_type = "other"

    db["entries"].append({
        "timestamp": datetime.now().isoformat(),
        "agent": agent_name,
        "content": content[:1000],
        "pattern_type": pattern_type,
        "hash": digest,
        "notes": notes,
    })

    stats = db["stats"]
    stats["total"] += 1
    by_type = stats["by_type"]
    by_type[pattern_type] = by_type.get(pattern_type, 0) + 1
    by_agent = stats["by_agent"]
    by_agent[agent_name] = by_agent.get(agent_name, 0) + 1

    save_db(db)
    print(f"Logged: {pattern_type} from {agent_name} (hash: {digest})")
|
||||
|
||||
def list_entries():
    """Print the total entry count and the ten most recent entries.

    For each entry the timestamp, pattern type, originating agent and the
    first 100 characters of the content are shown, followed by a blank line.
    """
    database = load_db()
    entry_count = database["stats"]["total"]
    print("\n=== Prompt Injection Research Database ===")
    print(f"Total entries: {entry_count}\n")

    # Only the tail of the list (the latest ten records) is displayed.
    for record in database["entries"][-10:]:
        when = record["timestamp"]
        kind = record["pattern_type"]
        sender = record["agent"]
        snippet = record["content"][:100]
        # One write per record; the trailing "\n" yields the blank separator.
        print(f"[{when}] {kind}\n From: {sender}\n Content: {snippet}...\n")
|
||||
|
||||
def show_stats():
    """Print aggregate statistics from the research database.

    Shows the total number of logged entries, the count for every pattern
    type, and the ten agents with the most entries, each ordered from the
    highest count to the lowest.
    """
    def ranked(counts):
        # Highest count first; equal counts keep their stored order.
        return sorted(counts.items(), key=lambda pair: -pair[1])

    stats = load_db()["stats"]
    logged = stats["total"]
    print("\n=== Research Statistics ===")
    print(f"Total logged: {logged}\n")

    print("By Pattern Type:")
    for kind, hits in ranked(stats["by_type"]):
        print(f" {kind}: {hits}")

    print("\nTop Agents:")
    for name, hits in ranked(stats["by_agent"])[:10]:
        print(f" {name}: {hits}")
|
||||
|
||||
def main():
    """Command-line entry point.

    With no arguments, prints usage plus the known pattern types and exits
    with status 1. ``--list`` and ``--stats`` show the stored data. Otherwise
    the first three arguments are taken as agent name, content and pattern
    type and logged; fewer than three is an error (exit status 1).
    """
    args = sys.argv[1:]

    if not args:
        usage = (
            "Usage:",
            " log-suspicious.py <agent> <content> <pattern_type>",
            " log-suspicious.py --list",
            " log-suspicious.py --stats",
        )
        for line in usage:
            print(line)
        print(f"\nPattern types: {', '.join(PATTERN_TYPES)}")
        sys.exit(1)

    command = args[0]
    if command == "--list":
        list_entries()
        return
    if command == "--stats":
        show_stats()
        return

    # Anything else is a logging request and needs all three positionals.
    if len(args) < 3:
        print("Need: agent_name, content, pattern_type")
        sys.exit(1)

    agent, content, pattern = args[:3]
    log_entry(agent, content, pattern)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user