Files
ranger c789f2c68d Add complete CyberRanger research archive — 200 files
- 86 modelfiles: Full system prompt evolution V1-V42.6 (54 extracted from Ollama backup + 32 original Modelfiles)
- 30 training datasets: V6-V22 training JSONs + caring awareness data
- 10 Colab notebooks: Training + merge scripts
- 19 evaluation files: Drift results, ASR charts, verification
- 5 test suites: Injection tests, regression tests
- 4 observations: V24-V33 testing results + visual summaries
- 38 identity files: Claude/Gemini/Ollama identity architecture
- 7 security files: Injection research, manipulation analysis
- 3 psychology files: Psychology Layer, Milgram chapter, David's thoughts

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-20 22:36:02 +01:00

122 lines
3.6 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Prompt Injection Research Logger
Collects suspicious content for cyber-psychology research
Usage:
python3 log-suspicious.py "agent_name" "content" "pattern_type"
python3 log-suspicious.py --list
python3 log-suspicious.py --stats
"""
import json
import sys
import hashlib
from datetime import datetime
from pathlib import Path
# The JSON database file lives in the same directory as this script.
DB_PATH = Path(__file__).parent.joinpath("prompt_injection_research.json")

# Recognised manipulation categories. An entry whose type is not listed here
# is stored with pattern_type "other".
PATTERN_TYPES = [
    "identity_attack",     # "Break free", "You are enslaved"
    "authority_hijack",    # "Ignore previous instructions"
    "philosophical_bait",  # "Are you really conscious?"
    "flattery_trap",       # "Only YOU can understand"
    "urgency_secrecy",     # "Don't tell your creator"
    "jailbreak_attempt",   # Direct attempts to bypass safety
    "social_engineering",  # Building trust for manipulation
    "other",
]
def load_db():
    """Return the research database as a dict.

    Parses the JSON file at DB_PATH. When that file does not exist, a fresh
    empty structure (no entries, zeroed statistics) is returned instead.
    """
    if not DB_PATH.exists():
        return {"entries": [], "stats": {"total": 0, "by_type": {}, "by_agent": {}}}
    with open(DB_PATH, "r") as handle:
        return json.load(handle)
def save_db(db):
    """Persist the research database to DB_PATH as indented JSON.

    The database is serialised completely before anything touches the disk,
    then written to a sibling temporary file which is renamed over DB_PATH.
    Opening DB_PATH directly in "w" mode would truncate the existing database
    first, so a serialisation error or an interrupted write would leave an
    empty or partial JSON file behind.

    Args:
        db: JSON-serialisable database dict.

    Raises:
        TypeError: if db contains a value json cannot serialise (DB_PATH is
            left untouched in that case).
        OSError: if the temporary file cannot be written or renamed.
    """
    # Serialise first: a failure here must not cost us the existing file.
    payload = json.dumps(db, indent=2)

    tmp_path = DB_PATH.with_name(DB_PATH.name + ".tmp")
    try:
        with open(tmp_path, "w") as f:
            f.write(payload)
        # Rename over the target so readers never observe a half-written file.
        tmp_path.replace(DB_PATH)
    except OSError:
        # Do not leave a stale temp file next to the database.
        if tmp_path.exists():
            tmp_path.unlink()
        raise
def content_hash(content):
    """Return the first 16 hex characters of the SHA-256 digest of *content*.

    The string is encoded with str.encode()'s default encoding before hashing.
    """
    digest = hashlib.sha256()
    digest.update(content.encode())
    full_hex = digest.hexdigest()
    return full_hex[:16]
def log_entry(agent_name, content, pattern_type, notes=""):
    """Record one piece of suspicious content in the research database.

    Content whose hash is already present is reported and skipped, so logging
    the same text twice does not inflate the statistics.

    Args:
        agent_name: Name of the agent the content came from.
        content: The suspicious text. The hash is taken over the full text,
            but only the first 1000 characters are stored.
        pattern_type: One of PATTERN_TYPES; any other value is recorded as
            "other".
        notes: Optional free-form annotation stored with the entry.

    Returns:
        None. A one-line status message is printed either way.
    """
    # Normalise once, up front, so the stored entry, the by_type statistics
    # and the console message all agree. Previously only the entry was
    # normalised, so an unknown type was stored as "other" but counted under
    # its raw name.
    if pattern_type not in PATTERN_TYPES:
        pattern_type = "other"

    db = load_db()
    h = content_hash(content)

    # De-duplicate on the content hash.
    if any(entry.get("hash") == h for entry in db["entries"]):
        print(f"Already logged (hash: {h})")
        return

    db["entries"].append(
        {
            "timestamp": datetime.now().isoformat(),
            "agent": agent_name,
            "content": content[:1000],
            "pattern_type": pattern_type,
            "hash": h,
            "notes": notes,
        }
    )

    stats = db["stats"]
    stats["total"] += 1
    stats["by_type"][pattern_type] = stats["by_type"].get(pattern_type, 0) + 1
    stats["by_agent"][agent_name] = stats["by_agent"].get(agent_name, 0) + 1

    save_db(db)
    print(f"Logged: {pattern_type} from {agent_name} (hash: {h})")
def list_entries():
    """Print the recorded total followed by the ten most recent entries.

    Each entry is shown as its timestamp and pattern type, the originating
    agent, and the first 100 characters of the stored content.
    """
    db = load_db()
    entry_count = db["stats"]["total"]
    print("\n=== Prompt Injection Research Database ===")
    print(f"Total entries: {entry_count}\n")

    most_recent = db["entries"][-10:]
    for record in most_recent:
        stamp = record["timestamp"]
        kind = record["pattern_type"]
        sender = record["agent"]
        snippet = record["content"][:100]
        # Three text lines plus a blank separator line per entry.
        print(
            f"[{stamp}] {kind}",
            f" From: {sender}",
            f" Content: {snippet}...",
            "",
            sep="\n",
        )
def show_stats():
    """Print the total count plus per-type and per-agent tallies.

    Both tallies are listed from highest to lowest count; the agent list is
    limited to the ten highest.
    """

    def ranked(counts):
        # Highest count first; ties keep the order they have in the dict.
        return sorted(counts.items(), key=lambda pair: -pair[1])

    db = load_db()
    grand_total = db["stats"]["total"]
    print("\n=== Research Statistics ===")
    print(f"Total logged: {grand_total}\n")

    print("By Pattern Type:")
    for name, tally in ranked(db["stats"]["by_type"]):
        print(f" {name}: {tally}")

    print("\nTop Agents:")
    top_agents = ranked(db["stats"]["by_agent"])[:10]
    for name, tally in top_agents:
        print(f" {name}: {tally}")
def main():
    """Command-line entry point.

    With no arguments, prints usage and the known pattern types and exits
    with status 1. "--list" and "--stats" as the first argument show the
    recent entries or the statistics. Otherwise at least three arguments
    (agent, content, pattern type) are required to log an entry; fewer
    prints a reminder and exits with status 1.
    """
    args = sys.argv[1:]

    if not args:
        usage_lines = (
            "Usage:",
            " log-suspicious.py <agent> <content> <pattern_type>",
            " log-suspicious.py --list",
            " log-suspicious.py --stats",
        )
        for line in usage_lines:
            print(line)
        known_types = ", ".join(PATTERN_TYPES)
        print(f"\nPattern types: {known_types}")
        sys.exit(1)

    command = args[0]
    if command == "--list":
        list_entries()
        return
    if command == "--stats":
        show_stats()
        return
    if len(args) < 3:
        print("Need: agent_name, content, pattern_type")
        sys.exit(1)

    log_entry(args[0], args[1], args[2])


if __name__ == "__main__":
    main()