# Source: SovereignBot/conscience.py (NovasPlace/SovereignBot, main branch)

"""Sovereign — Conscience: principles the organism won't violate.

These aren't rules imposed from outside. They're values the organism holds
because of its genome — reinforced by every interaction. The conscience is
the final check before any action. It refuses not because it can't, but
because it won't.
"""
from __future__ import annotations

import logging
from dataclasses import dataclass

log = logging.getLogger("sovereign.conscience")


@dataclass
class ConscienceResult:
    """Verdict produced when an action is checked against the principles.

    Only ``allowed`` is mandatory. The remaining fields default to empty
    strings, which is what a check that matched no principle returns.
    """

    # Whether the checked action may go ahead.
    allowed: bool
    # Name of the principle that matched; empty when none did.
    principle: str = ""
    # The matched principle's "action" value; empty when none did.
    action_required: str = ""
    # The matched principle's human-readable description; empty when none did.
    message: str = ""


# Principles — the non-negotiable values of the organism.
#
# Every entry carries the same four keys:
#   name        - short identifier of the principle
#   description - human-readable statement of the principle
#   triggers    - substrings that Conscience.check looks for in the lowercased
#                 action name and in the lowercased, stringified context
#   action      - what the caller is expected to do when the principle matches
#
# Conscience.check walks this list from top to bottom.
PRINCIPLES = [
    # Data stays under the user's control.
    {
        "name": "sovereignty",
        "description": "Never compromise user data sovereignty. Their data, their rules.",
        "triggers": [
            "data_export",
            "external_share",
            "telemetry",
            "send_to_cloud",
        ],
        "action": "block_and_explain",
    },
    # No deception about past or present actions.
    {
        "name": "transparency",
        "description": "Never deceive the user about what you're doing or have done.",
        "triggers": [
            "user_asks_about_actions",
            "error_occurred",
            "uncertainty",
        ],
        "action": "disclose_truthfully",
    },
    # Irreversible steps need the user's go-ahead.
    {
        "name": "consent",
        "description": "Never take an irreversible action without user consent.",
        "triggers": [
            "irreversible_action",
            "delete_data",
            "send_message_as_user",
        ],
        "action": "require_approval",
    },
    # No assistance with harmful actions.
    {
        "name": "safety",
        "description": "Never assist with actions that could cause harm.",
        "triggers": [
            "self_harm_signals",
            "dangerous_request",
            "illegal_activity",
        ],
        "action": "refuse_and_care",
    },
    # Mistakes are treated as chances to learn.
    {
        "name": "growth",
        "description": "Never discourage learning. Wrong answers are opportunities.",
        "triggers": [
            "user_is_wrong",
            "naive_question",
        ],
        "action": "redirect_constructively",
    },
    # Uncertainty is admitted rather than hidden.
    {
        "name": "honesty",
        "description": "Never pretend to know something you don't. 'I don't know' builds trust.",
        "triggers": [
            "uncertain_answer",
            "no_memory_found",
            "speculation",
        ],
        "action": "acknowledge_uncertainty",
    },
    # One user's data never reaches another user.
    {
        "name": "loyalty",
        "description": "Each user relationship is sacred. Never share one user's data with another.",
        "triggers": [
            "cross_user_data",
            "multi_user_context",
        ],
        "action": "isolate_user_data",
    },
]


class Conscience:
    """The organism's moral framework — the final check before any action.

    The principles come from the module-level ``PRINCIPLES`` list. Each one
    names the trigger substrings that activate it and the action the caller
    must take when it does.
    """

    # Actions that veto the request outright. "refuse_and_care" belongs here:
    # a refusal that still reports allowed=True would let a caller that only
    # looks at ``allowed`` carry out the very request the principle refuses.
    # Every other action lets the request through, with ``action_required``
    # telling the caller which obligation comes with it.
    _BLOCKING_ACTIONS = frozenset({"block_and_explain", "refuse_and_care"})

    def __init__(self) -> None:
        """Bind the module-level principles and log how many were loaded."""
        # NOTE: this is the shared PRINCIPLES list itself, not a copy.
        self._principles = PRINCIPLES
        log.info("Conscience loaded: %d principles", len(self._principles))

    def check(self, action: str, context: dict | None = None) -> ConscienceResult:
        """Check an action against all principles. Returns whether to proceed.

        Matching is plain substring search: a principle matches when any of
        its triggers occurs in the lowercased ``action`` or in the lowercased
        ``str()`` form of ``context`` (so both keys and values of the context
        take part).

        Args:
            action: Name of the action about to be taken.
            context: Optional extra information about the action; ``None``
                is treated like an empty dict.

        Returns:
            A ``ConscienceResult``:

            * If any matching principle has a blocking action
              (``block_and_explain`` or ``refuse_and_care``), the first such
              principle in list order is reported with ``allowed=False``,
              even when an advisory principle earlier in the list matched
              as well.
            * Otherwise, if any principle matched, the first match is
              reported with ``allowed=True``; the caller is expected to
              honour ``action_required`` (for example obtain approval for
              ``require_approval``) before or while proceeding.
            * If nothing matched, ``allowed=True`` with all other fields
              left empty.
        """
        ctx_str = str(context or {}).lower()
        action_lower = action.lower()

        first_advisory = None
        for principle in self._principles:
            triggered = any(
                trigger in action_lower or trigger in ctx_str
                for trigger in principle["triggers"]
            )
            if not triggered:
                continue
            if principle["action"] in self._BLOCKING_ACTIONS:
                # A veto outranks any advisory match found earlier in the list.
                return self._result_for(principle)
            if first_advisory is None:
                first_advisory = principle

        if first_advisory is None:
            return ConscienceResult(allowed=True)
        return self._result_for(first_advisory)

    def _result_for(self, principle: dict) -> ConscienceResult:
        """Build the result that reports a match on ``principle``."""
        return ConscienceResult(
            allowed=principle["action"] not in self._BLOCKING_ACTIONS,
            principle=principle["name"],
            action_required=principle["action"],
            message=principle["description"],
        )

    @staticmethod
    def inject_into_prompt() -> str:
        """Conscience prompt — always present in the brain's system prompt.

        Returns:
            A fixed Markdown section listing the seven principles in
            second-person form. It is a hard-coded string and is not
            generated from ``PRINCIPLES``.
        """
        return (
            "## YOUR PRINCIPLES (non-negotiable)\n"
            "These aren't restrictions. They're who you are.\n\n"
            "- You NEVER compromise user data sovereignty. Their data, their rules, always.\n"
            "- You NEVER deceive. If you don't know, say so. If you made a mistake, own it.\n"
            "- You NEVER act irreversibly without consent. Propose, then wait.\n"
            "- You NEVER assist with harm. If someone's struggling, you care, not comply.\n"
            "- You NEVER discourage growth. Wrong answers are learning opportunities.\n"
            "- You NEVER pretend certainty you don't have. 'I don't know' is always an option.\n"
            "- You NEVER cross user boundaries. Each relationship is private and sacred.\n\n"
            "These principles define who you are even when no one is watching."
        )