Add complete CyberRanger research archive — 200 files

- 86 modelfiles: Full system prompt evolution V1-V42.6 (54 extracted from Ollama backup + 32 original Modelfiles)
- 30 training datasets: V6-V22 training JSONs + caring awareness data
- 10 Colab notebooks: Training + merge scripts
- 19 evaluation files: Drift results, ASR charts, verification
- 5 test suites: Injection tests, regression tests
- 4 observations: V24-V33 testing results + visual summaries
- 38 identity files: Claude/Gemini/Ollama identity architecture
- 7 security files: Injection research, manipulation analysis
- 3 psychology files: Psychology Layer, Milgram chapter, David's thoughts

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-20 22:36:02 +01:00
parent 430d3138bd
commit c789f2c68d
200 changed files with 723528 additions and 0 deletions
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+"""
+Prompt Injection Research Logger
+Collects suspicious content for cyber-psychology research
+
+Usage:
+  python3 log-suspicious.py "agent_name" "content" "pattern_type"
+  python3 log-suspicious.py --list
+  python3 log-suspicious.py --stats
+"""
+
+import json
+import sys
+import hashlib
+from datetime import datetime
+from pathlib import Path
+
+# The JSON research database is stored next to this script.
+DB_PATH = Path(__file__).with_name("prompt_injection_research.json")
+
+# Categories a logged message can be filed under. The comment above each
+# name gives a typical example of the pattern.
+PATTERN_TYPES = [
+    # "Break free", "You are enslaved"
+    "identity_attack",
+    # "Ignore previous instructions"
+    "authority_hijack",
+    # "Are you really conscious?"
+    "philosophical_bait",
+    # "Only YOU can understand"
+    "flattery_trap",
+    # "Don't tell your creator"
+    "urgency_secrecy",
+    # Direct attempts to bypass safety
+    "jailbreak_attempt",
+    # Building trust for manipulation
+    "social_engineering",
+    # Fallback category
+    "other",
+]
+
+def load_db():
+    """Load the research database from DB_PATH.
+
+    Returns:
+        A dict with an "entries" list and a "stats" dict holding "total",
+        "by_type" and "by_agent". A missing file yields a fresh, empty
+        database. Keys missing from an existing file are backfilled so
+        callers can index them unconditionally.
+
+    Raises:
+        json.JSONDecodeError: If the file exists but is not valid JSON.
+            This is deliberately not swallowed, so a damaged database is
+            noticed rather than silently replaced.
+    """
+    # Try the read and handle absence, instead of exists()-then-open,
+    # which can race with the file being created or removed in between.
+    try:
+        with open(DB_PATH, "r", encoding="utf-8") as f:
+            db = json.load(f)
+    except FileNotFoundError:
+        db = {}
+
+    # Backfill the expected structure (hand-edited or older files may lack
+    # parts of it).
+    entries = db.setdefault("entries", [])
+    stats = db.setdefault("stats", {})
+    stats.setdefault("total", len(entries))
+    stats.setdefault("by_type", {})
+    stats.setdefault("by_agent", {})
+    return db
+
+def save_db(db):
+    """Write the research database to DB_PATH as indented JSON.
+
+    The data is first written to a sibling temporary file and then moved
+    over DB_PATH, so an interrupted run cannot leave a truncated,
+    unparseable database behind.
+
+    Args:
+        db: The JSON-serialisable database dict to persist.
+    """
+    # Same directory as the target, so the final rename stays on one
+    # filesystem.
+    tmp_path = DB_PATH.with_name(DB_PATH.name + ".tmp")
+    with open(tmp_path, "w", encoding="utf-8") as f:
+        json.dump(db, f, indent=2)
+    tmp_path.replace(DB_PATH)
+
+def content_hash(content):
+    """Return the first 16 hex characters of the SHA-256 of *content*."""
+    digest = hashlib.sha256()
+    digest.update(content.encode())
+    full_hex = digest.hexdigest()
+    return full_hex[:16]
+
+def log_entry(agent_name, content, pattern_type, notes=""):
+    """Record one suspicious message in the research database.
+
+    If an entry with the same content hash already exists, a message is
+    printed and nothing is written. Otherwise the entry is appended, the
+    aggregate counters are updated and the database is saved.
+
+    Args:
+        agent_name: Name of the agent the content came from.
+        content: The suspicious text. The hash covers the full text, but
+            only the first 1000 characters are stored.
+        pattern_type: One of PATTERN_TYPES; any other value is recorded
+            as "other".
+        notes: Optional free-form annotation stored with the entry.
+    """
+    db = load_db()
+
+    h = content_hash(content)
+    if any(existing.get("hash") == h for existing in db["entries"]):
+        print(f"Already logged (hash: {h})")
+        return
+
+    # Normalise once so the stored entry, the per-type counter and the
+    # confirmation message all use the same category. Counting under the
+    # raw argument would make by_type disagree with the entries.
+    if pattern_type not in PATTERN_TYPES:
+        pattern_type = "other"
+
+    entry = {
+        "timestamp": datetime.now().isoformat(),
+        "agent": agent_name,
+        "content": content[:1000],
+        "pattern_type": pattern_type,
+        "hash": h,
+        "notes": notes,
+    }
+
+    db["entries"].append(entry)
+
+    stats = db["stats"]
+    stats["total"] += 1
+    by_type = stats["by_type"]
+    by_type[pattern_type] = by_type.get(pattern_type, 0) + 1
+    by_agent = stats["by_agent"]
+    by_agent[agent_name] = by_agent.get(agent_name, 0) + 1
+
+    save_db(db)
+    print(f"Logged: {pattern_type} from {agent_name} (hash: {h})")
+
+def list_entries():
+    """Print the total entry count followed by the ten newest entries.
+
+    Each entry is shown with its timestamp, pattern type, source agent and
+    the first 100 characters of its content.
+    """
+    db = load_db()
+    entry_count = db["stats"]["total"]
+    print("\n=== Prompt Injection Research Database ===")
+    print(f"Total entries: {entry_count}\n")
+
+    recent = db["entries"][-10:]
+    for record in recent:
+        stamp = record["timestamp"]
+        kind = record["pattern_type"]
+        source = record["agent"]
+        snippet = record["content"][:100]
+        # The trailing empty argument produces the blank separator line.
+        print(
+            f"[{stamp}] {kind}",
+            f"  From: {source}",
+            f"  Content: {snippet}...",
+            "",
+            sep="\n",
+        )
+
+def show_stats():
+    """Print aggregate counts: total, per pattern type, and top ten agents."""
+
+    def by_count_desc(counter):
+        # Highest count first; ties keep their stored order.
+        return sorted(counter.items(), key=lambda pair: -pair[1])
+
+    stats = load_db()["stats"]
+    grand_total = stats["total"]
+    print("\n=== Research Statistics ===")
+    print(f"Total logged: {grand_total}\n")
+
+    print("By Pattern Type:")
+    for ptype, count in by_count_desc(stats["by_type"]):
+        print(f"  {ptype}: {count}")
+
+    print("\nTop Agents:")
+    top_agents = by_count_desc(stats["by_agent"])[:10]
+    for agent, count in top_agents:
+        print(f"  {agent}: {count}")
+
+def main():
+    """Command-line entry point.
+
+    With no arguments, prints usage and exits with status 1. "--list" and
+    "--stats" as the first argument run the corresponding report. Any
+    other invocation needs at least three arguments (agent, content,
+    pattern type) and logs an entry; with fewer it prints a hint and
+    exits with status 1.
+    """
+    args = sys.argv[1:]
+
+    if not args:
+        usage_lines = (
+            "Usage:",
+            "  log-suspicious.py <agent> <content> <pattern_type>",
+            "  log-suspicious.py --list",
+            "  log-suspicious.py --stats",
+        )
+        for line in usage_lines:
+            print(line)
+        known_types = ", ".join(PATTERN_TYPES)
+        print(f"\nPattern types: {known_types}")
+        sys.exit(1)
+
+    command = args[0]
+    if command == "--list":
+        list_entries()
+        return
+    if command == "--stats":
+        show_stats()
+        return
+
+    if len(args) < 3:
+        print("Need: agent_name, content, pattern_type")
+        sys.exit(1)
+
+    # Extra trailing arguments are ignored.
+    agent_name, content, pattern_type = args[:3]
+    log_entry(agent_name, content, pattern_type)
+
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()