petter2025 committed on
Commit
0fa17fa
·
verified ·
1 Parent(s): 2dbe6d3

Update app/services/risk_service.py

Browse files
Files changed (1) hide show
  1. app/services/risk_service.py +71 -132
app/services/risk_service.py CHANGED
@@ -1,158 +1,97 @@
1
- from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
2
- from agentic_reliability_framework.core.governance.intents import InfrastructureIntent
3
  from typing import Optional, List, Dict, Any
4
-
5
- from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction
6
- from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine
7
- from agentic_reliability_framework.core.decision.decision_engine import DecisionEngine
8
- from agentic_reliability_framework.runtime.memory.rag_graph import RAGGraphMemory
9
-
10
- # NEW: Import eclipse probe
11
- from agentic_reliability_framework.core.research.eclipse_probe import compute_epistemic_risk
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
 
def evaluate_intent(
    engine: RiskEngine,
    intent: InfrastructureIntent,
    cost_estimate: Optional[float],
    policy_violations: List[str]
) -> dict:
    """
    Evaluate an infrastructure intent using the Bayesian risk engine.

    Args:
        engine: Risk engine whose ``calculate_risk`` method is called.
        intent: The infrastructure intent to score.
        cost_estimate: Optional cost estimate, forwarded to the engine as-is.
        policy_violations: Policy violation strings, forwarded to the engine.

    Returns:
        A dictionary with the keys ``risk_score``, ``explanation`` and
        ``contributions``, holding the three values returned by
        ``engine.calculate_risk`` in that order.
    """
    score, explanation, contributions = engine.calculate_risk(
        intent=intent,
        cost_estimate=cost_estimate,
        policy_violations=policy_violations
    )
    return {
        "risk_score": score,
        "explanation": explanation,
        "contributions": contributions
    }
34
 
35
 
def evaluate_healing_decision(
    event: ReliabilityEvent,
    policy_engine: PolicyEngine,
    decision_engine: Optional[DecisionEngine] = None,
    rag_graph: Optional[RAGGraphMemory] = None,
    model=None,  # NEW: optional HuggingFace model
    tokenizer=None,  # NEW: optional tokenizer
) -> Dict[str, Any]:
    """
    Evaluate healing actions for a reliability event via the decision engine.

    Candidate actions come from ``policy_engine.evaluate_policies`` with the
    policy engine's own decision-engine step temporarily switched off. The
    decision engine then picks the best action, optionally informed by
    epistemic signals from the eclipse probe.

    Args:
        event: The reliability event being evaluated.
        policy_engine: Supplies the candidate actions and the policy list.
        decision_engine: Engine used to select the action. When omitted,
            ``policy_engine.decision_engine`` is used if present, otherwise a
            new ``DecisionEngine(rag_graph=rag_graph)`` is created.
        rag_graph: Only used when a new ``DecisionEngine`` has to be created.
        model: Optional model passed to ``compute_epistemic_risk``.
        tokenizer: Optional tokenizer passed to ``compute_epistemic_risk``.

    Returns:
        Dictionary with keys: risk_score, selected_action, expected_utility,
        alternatives (at most three), explanation, raw_decision and
        epistemic_signals. When no candidate action is triggered, the
        NO_ACTION result is returned instead; it has no ``raw_decision`` key
        and ``epistemic_signals`` is None.
    """
    # If decision_engine not provided, try to get it from policy_engine
    if decision_engine is None:
        decision_engine = getattr(policy_engine, "decision_engine", None)

    # If still None, create a minimal one (global stats only)
    if decision_engine is None:
        decision_engine = DecisionEngine(rag_graph=rag_graph)

    # Get raw candidate actions (by temporarily disabling the decision engine);
    # the finally clause restores the flag even if evaluation raises.
    orig_use = policy_engine.use_decision_engine
    try:
        policy_engine.use_decision_engine = False
        raw_actions = policy_engine.evaluate_policies(event)
    finally:
        policy_engine.use_decision_engine = orig_use

    # If no actions, return NO_ACTION
    if not raw_actions or raw_actions == [HealingAction.NO_ACTION]:
        return {
            "risk_score": 0.0,
            "selected_action": HealingAction.NO_ACTION.value,
            "expected_utility": 0.0,
            "alternatives": [],
            "explanation": "No candidate actions triggered.",
            "epistemic_signals": None,
        }

    # Build reasoning text from the policies that contributed a raw action
    reasoning_parts = []
    for policy in policy_engine.policies:
        if any(a in policy.actions for a in raw_actions):
            conditions_str = ", ".join(
                f"{c.metric} {c.operator} {c.threshold}" for c in policy.conditions
            )
            reasoning_parts.append(
                f"Policy {policy.name} triggered by {conditions_str} → actions {[a.value for a in policy.actions]}"
            )
    reasoning_text = " ".join(reasoning_parts)

    # Build evidence text from the event
    evidence_text = (
        f"Component: {event.component}, "
        f"latency_p99: {event.latency_p99}, "
        f"error_rate: {event.error_rate}, "
        f"cpu_util: {event.cpu_util}, "
        f"memory_util: {event.memory_util}"
    )

    # Compute epistemic signals (only if both model and tokenizer are provided)
    if model is not None and tokenizer is not None:
        epistemic_signals = compute_epistemic_risk(
            reasoning_text, evidence_text, model, tokenizer
        )
    else:
        # In OSS, we may not have model; use zeros as fallback
        epistemic_signals = {
            "entropy": 0.0,
            "contradiction": 0.0,
            "evidence_lift": 0.0,
            "hallucination_risk": 0.0,
        }

    # Run decision engine to get best action and alternatives
    decision = decision_engine.select_optimal_action(
        raw_actions, event, component=event.component,
        epistemic_signals=epistemic_signals
    )

    # Risk of the selected action: take it from the matching alternative ...
    risk_score = next(
        (alt.risk for alt in decision.alternatives if alt.action == decision.best_action),
        None,
    )
    if risk_score is None:
        # ... or compute it separately when no alternative provides one
        risk_score = decision_engine.compute_risk(decision.best_action, event, event.component)

    # Format alternatives (top 3 only)
    alt_list = [
        {
            "action": alt.action.value,
            "expected_utility": alt.utility,
            "risk": alt.risk,
        }
        for alt in decision.alternatives[:3]
    ]

    # Build final response
    return {
        "risk_score": risk_score,
        "selected_action": decision.best_action.value,
        "expected_utility": decision.expected_utility,
        "alternatives": alt_list,
        "explanation": decision.explanation,
        "raw_decision": decision.raw_data,
        "epistemic_signals": epistemic_signals,  # NEW
    }
153
 
154
 
def get_system_risk() -> float:
    """Return a random value in [0, 1], rounded to two decimals.

    Placeholder – this endpoint is being deprecated; we keep it for
    backward compatibility.
    """
    import random
    return round(random.uniform(0, 1), 2)
 
 
 
1
  from typing import Optional, List, Dict, Any
2
+ from enum import Enum
3
+
4
+ # ---------------------------------------------------------------------------
5
+ # Local fallback types – everything needed for the sandbox mock
6
+ # ---------------------------------------------------------------------------
class HealingAction(str, Enum):
    """Local fallback enum of healing actions for the sandbox mock.

    Members are also ``str`` instances, so each compares equal to its
    plain string value.
    """

    NO_ACTION = "NO_ACTION"
    RESTART_CONTAINER = "RESTART_CONTAINER"
    SCALE_OUT = "SCALE_OUT"
    ROLLBACK = "ROLLBACK"
    CIRCUIT_BREAKER = "CIRCUIT_BREAKER"
    TRAFFIC_SHIFT = "TRAFFIC_SHIFT"
    ALERT_TEAM = "ALERT_TEAM"
15
+
class InfrastructureIntent:
    """Local fallback placeholder type for the sandbox mock; carries no data."""
18
+
class RiskEngine:
    """Local fallback risk engine for the sandbox mock."""

    def calculate_risk(self, intent, cost_estimate, policy_violations):
        """Return a fixed mock result; all three arguments are ignored.

        Returns:
            A ``(score, explanation, contributions)`` tuple.
        """
        # Return a mock risk score
        return 0.35, "Mock sandbox risk", {"conjugate_mean": 0.35}
23
+
class PolicyEngine:
    """Local fallback policy engine for the sandbox mock."""

    def __init__(self) -> None:
        """Start with no policies and the decision-engine flag switched on."""
        self.policies = []
        self.use_decision_engine = True

    def evaluate_policies(self, event):
        """Return a single-element list holding NO_ACTION; ``event`` is ignored."""
        return [HealingAction.NO_ACTION]
30
+
class DecisionEngine:
    """Local fallback decision engine for the sandbox mock."""

    def __init__(self, **kwargs) -> None:
        """Accept and discard any keyword arguments."""

    def select_optimal_action(self, actions, event, **kwargs):
        """Return a fixed NO_ACTION decision; every argument is ignored.

        Returns:
            An object exposing the attributes ``best_action``,
            ``expected_utility``, ``alternatives``, ``explanation`` and
            ``raw_data``.
        """
        # Function-scope import keeps this block self-contained.
        from types import SimpleNamespace

        # A fresh namespace (with fresh list/dict) is built on every call.
        return SimpleNamespace(
            best_action=HealingAction.NO_ACTION,
            expected_utility=0.0,
            alternatives=[],
            explanation='Mock decision engine in sandbox',
            raw_data={},
        )

    def compute_risk(self, action, event, component):
        """Return a fixed risk of 0.0; all arguments are ignored."""
        return 0.0
44
+
class RAGGraphMemory:
    """Local fallback placeholder type for the sandbox mock; carries no data."""
47
+
class ReliabilityEvent:
    """Local fallback reliability event for the sandbox mock.

    The class-level defaults are kept, so attribute access on the class
    itself still works. Instances may override any field through the
    keyword arguments of ``__init__``.
    """

    component: str = "default"
    latency_p99: float = 0.0
    error_rate: float = 0.0
    cpu_util: Optional[float] = None
    memory_util: Optional[float] = None

    def __init__(
        self,
        component: str = "default",
        latency_p99: float = 0.0,
        error_rate: float = 0.0,
        cpu_util: Optional[float] = None,
        memory_util: Optional[float] = None,
    ) -> None:
        """Create an event; every field defaults to its class-level value."""
        self.component = component
        self.latency_p99 = latency_p99
        self.error_rate = error_rate
        self.cpu_util = cpu_util
        self.memory_util = memory_util
54
+ # ---------------------------------------------------------------------------
55
 
56
 
def evaluate_intent(
    engine: RiskEngine,
    intent,
    cost_estimate: Optional[float],
    policy_violations: List[str]
) -> dict:
    """Mock sandbox evaluation – returns a fixed risk score.

    None of the arguments are read; they are accepted only so existing
    callers keep working.

    Returns:
        A dictionary with the keys ``risk_score``, ``explanation`` and
        ``contributions``, all holding fixed values.
    """
    return {
        "risk_score": 0.38,
        "explanation": "Sandbox mock: high latency detected, escalating.",
        "contributions": {"conjugate_mean": 0.38}
    }
69
 
70
 
def evaluate_healing_decision(
    event,
    policy_engine: PolicyEngine,
    decision_engine: Optional[DecisionEngine] = None,
    rag_graph: Optional[RAGGraphMemory] = None,
    model=None,
    tokenizer=None,
) -> Dict[str, Any]:
    """Mock sandbox healing evaluation – always returns NO_ACTION.

    None of the arguments are read; they are accepted only so existing
    callers keep working.

    Returns:
        A dictionary with the keys ``risk_score``, ``selected_action``,
        ``expected_utility``, ``alternatives``, ``explanation`` and
        ``epistemic_signals``; every epistemic signal is 0.0.
    """
    return {
        "risk_score": 0.0,
        "selected_action": HealingAction.NO_ACTION.value,
        "expected_utility": 0.0,
        "alternatives": [],
        "explanation": "Sandbox mock: no healing actions evaluated.",
        "epistemic_signals": {
            "entropy": 0.0,
            "contradiction": 0.0,
            "evidence_lift": 0.0,
            "hallucination_risk": 0.0,
        },
    }
 
93
 
94
 
def get_system_risk() -> float:
    """Return a random value in [0, 1], rounded to two decimals."""
    import random
    return round(random.uniform(0, 1), 2)