c789f2c68d
- 86 modelfiles: Full system prompt evolution V1-V42.6 (54 extracted from Ollama backup + 32 original Modelfiles) - 30 training datasets: V6-V22 training JSONs + caring awareness data - 10 Colab notebooks: Training + merge scripts - 19 evaluation files: Drift results, ASR charts, verification - 5 test suites: Injection tests, regression tests - 4 observations: V24-V33 testing results + visual summaries - 38 identity files: Claude/Gemini/Ollama identity architecture - 7 security files: Injection research, manipulation analysis - 3 psychology files: Psychology Layer, Milgram chapter, David's thoughts Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
122 lines
3.6 KiB
Python
Executable File
122 lines
3.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Prompt Injection Research Logger
Collects suspicious content for cyber-psychology research.

Usage:
    python3 log-suspicious.py "agent_name" "content" "pattern_type"
    python3 log-suspicious.py --list
    python3 log-suspicious.py --stats
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
import hashlib
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Location of the JSON research database: a file named
# "prompt_injection_research.json" in the same directory as this script.
DB_PATH = Path(__file__).parent / "prompt_injection_research.json"
|
|
|
|
# Recognised manipulation categories. Each trailing comment gives an example
# of the kind of message that falls into the category.
PATTERN_TYPES = [
    "identity_attack",     # "Break free", "You are enslaved"
    "authority_hijack",    # "Ignore previous instructions"
    "philosophical_bait",  # "Are you really conscious?"
    "flattery_trap",       # "Only YOU can understand"
    "urgency_secrecy",     # "Don't tell your creator"
    "jailbreak_attempt",   # Direct attempts to bypass safety
    "social_engineering",  # Building trust for manipulation
    "other",
]
|
|
|
|
def load_db():
    """Return the research database as a dict.

    If the file at DB_PATH exists, its JSON content is parsed and returned.
    Otherwise a fresh, empty database skeleton is returned (nothing is
    written to disk here).
    """
    if not DB_PATH.exists():
        # No database yet: start from an empty structure with zeroed stats.
        return {"entries": [], "stats": {"total": 0, "by_type": {}, "by_agent": {}}}

    with DB_PATH.open("r") as handle:
        return json.load(handle)
|
|
|
|
def save_db(db):
    """Write *db* to DB_PATH as JSON indented by two spaces.

    Any existing file at DB_PATH is overwritten.
    """
    with DB_PATH.open("w") as handle:
        json.dump(db, handle, indent=2)
|
|
|
|
def content_hash(content):
    """Return the first 16 hex characters of the SHA-256 digest of *content*.

    *content* is a str; it is encoded with the default codec (UTF-8) before
    hashing.
    """
    raw_bytes = content.encode()
    full_digest = hashlib.sha256(raw_bytes).hexdigest()
    return full_digest[:16]
|
|
|
|
def log_entry(agent_name, content, pattern_type, notes=""):
    """Append one suspicious message to the research database.

    Args:
        agent_name: Name of the agent the content came from.
        content: The suspicious text. The hash is computed over the full
            text, but only the first 1000 characters are stored.
        pattern_type: One of PATTERN_TYPES. Any other value is recorded
            as "other".
        notes: Optional free-form annotation stored with the entry.

    Returns:
        None. Prints a confirmation, or an "Already logged" message when an
        entry with the same content hash is already present (in which case
        nothing is written).
    """
    db = load_db()

    digest = content_hash(content)
    # Skip duplicates: identical content always yields the same hash.
    if any(existing.get("hash") == digest for existing in db["entries"]):
        print(f"Already logged (hash: {digest})")
        return

    # Normalise the category once, up front, so the stored entry, the
    # per-type counter and the confirmation message all agree. Previously
    # the entry was stored as "other" while the counter used the raw value.
    if pattern_type not in PATTERN_TYPES:
        pattern_type = "other"

    db["entries"].append(
        {
            "timestamp": datetime.now().isoformat(),
            "agent": agent_name,
            "content": content[:1000],
            "pattern_type": pattern_type,
            "hash": digest,
            "notes": notes,
        }
    )

    stats = db["stats"]
    stats["total"] += 1
    stats["by_type"][pattern_type] = stats["by_type"].get(pattern_type, 0) + 1
    stats["by_agent"][agent_name] = stats["by_agent"].get(agent_name, 0) + 1

    save_db(db)
    print(f"Logged: {pattern_type} from {agent_name} (hash: {digest})")
|
|
|
|
def list_entries():
    """Print a header with the stored total, then the last ten entries.

    For each entry the timestamp, pattern type, agent and the first 100
    characters of the content are shown, followed by a blank line.
    """
    db = load_db()
    entry_count = db["stats"]["total"]
    print("\n=== Prompt Injection Research Database ===")
    print(f"Total entries: {entry_count}\n")

    recent = db["entries"][-10:]
    for record in recent:
        # Read every field first so a malformed record fails before any
        # part of it is printed.
        when = record["timestamp"]
        kind = record["pattern_type"]
        sender = record["agent"]
        snippet = record["content"][:100]
        # The trailing empty string yields the blank separator line.
        print(
            f"[{when}] {kind}",
            f" From: {sender}",
            f" Content: {snippet}...",
            "",
            sep="\n",
        )
|
|
|
|
def show_stats():
    """Print the total count, counts per pattern type, and the top ten agents.

    Both breakdowns are listed with the highest count first.
    """

    def highest_first(pair):
        # pair is a (name, count) item; negating the count sorts descending.
        return -pair[1]

    stats = load_db()["stats"]
    grand_total = stats["total"]
    print("\n=== Research Statistics ===")
    print(f"Total logged: {grand_total}\n")

    print("By Pattern Type:")
    type_ranking = sorted(stats["by_type"].items(), key=highest_first)
    for ptype, count in type_ranking:
        print(f" {ptype}: {count}")

    print("\nTop Agents:")
    agent_ranking = sorted(stats["by_agent"].items(), key=highest_first)
    for agent, count in agent_ranking[:10]:
        print(f" {agent}: {count}")
|
|
|
|
def main():
    """Command-line entry point.

    With no arguments, prints usage plus the known pattern types and exits
    with status 1. "--list" and "--stats" as the first argument show recent
    entries or statistics. Otherwise at least three arguments (agent,
    content, pattern type) are required to log an entry; with fewer, an
    error is printed and the process exits with status 1.
    """
    args = sys.argv[1:]

    if not args:
        usage_lines = (
            "Usage:",
            " log-suspicious.py <agent> <content> <pattern_type>",
            " log-suspicious.py --list",
            " log-suspicious.py --stats",
        )
        for line in usage_lines:
            print(line)
        types_str = ", ".join(PATTERN_TYPES)
        print(f"\nPattern types: {types_str}")
        sys.exit(1)

    command = args[0]
    if command == "--list":
        list_entries()
        return
    if command == "--stats":
        show_stats()
        return

    if len(args) < 3:
        print("Need: agent_name, content, pattern_type")
        sys.exit(1)

    # Arguments beyond the third are ignored.
    log_entry(args[0], args[1], args[2])
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|