youssefleb committed
Commit 6e30edd · verified · 1 Parent(s): e5b1c26

Update mcp_servers.py

Files changed (1):
  1. mcp_servers.py +82 -10
mcp_servers.py CHANGED
@@ -1,4 +1,4 @@
-# mcp_servers.py (FIXED: Robust 'str' handling & Enhanced Extraction)
+# mcp_servers.py (FIXED: Prompt-Schema Alignment + Detailed Logging)
 import asyncio
 import json
 import re
@@ -12,10 +12,58 @@ from personas import PERSONAS_DATA
 
 EVALUATION_PROMPT_TEMPLATE = load_prompt(config.PROMPT_FILES["evaluator"])
 
+# --- DEFINING THE SCHEMA TO FORCE JUSTIFICATIONS ---
+# This forces the LLM to return a specific JSON structure with "score" and "justification"
+EVALUATION_SCHEMA = {
+    "type": "OBJECT",
+    "properties": {
+        "Novelty": {
+            "type": "OBJECT",
+            "properties": {
+                "score": {"type": "INTEGER"},
+                "justification": {"type": "STRING"}
+            },
+            "required": ["score", "justification"]
+        },
+        "Usefulness_Feasibility": {
+            "type": "OBJECT",
+            "properties": {
+                "score": {"type": "INTEGER"},
+                "justification": {"type": "STRING"}
+            },
+            "required": ["score", "justification"]
+        },
+        "Flexibility": {
+            "type": "OBJECT",
+            "properties": {
+                "score": {"type": "INTEGER"},
+                "justification": {"type": "STRING"}
+            },
+            "required": ["score", "justification"]
+        },
+        "Elaboration": {
+            "type": "OBJECT",
+            "properties": {
+                "score": {"type": "INTEGER"},
+                "justification": {"type": "STRING"}
+            },
+            "required": ["score", "justification"]
+        },
+        "Cultural_Appropriateness": {
+            "type": "OBJECT",
+            "properties": {
+                "score": {"type": "INTEGER"},
+                "justification": {"type": "STRING"}
+            },
+            "required": ["score", "justification"]
+        }
+    },
+    "required": ["Novelty", "Usefulness_Feasibility", "Flexibility", "Elaboration", "Cultural_Appropriateness"]
+}
+
 def extract_json(text: str) -> dict:
     """Robustly extracts JSON from text."""
     try:
-        # 1. Try simple block extraction first
         clean_text = text.strip()
         if "```json" in clean_text:
             clean_text = clean_text.split("```json")[1].split("```")[0].strip()
@@ -23,10 +71,7 @@ def extract_json(text: str) -> dict:
             clean_text = clean_text.split("```")[1].split("```")[0].strip()
         return json.loads(clean_text)
     except (json.JSONDecodeError, IndexError):
-        # 2. Fallback: Regex to find the outermost JSON object
         try:
-            # Looks for { ... } but matching closest brackets is hard with regex.
-            # This regex grabs from the first { to the last }
             match = re.search(r'(\{[\s\S]*\})', text)
             if match:
                 return json.loads(match.group(1))
@@ -44,13 +89,34 @@ class BusinessSolutionEvaluator:
 
     async def evaluate(self, problem: str, solution_text: str) -> dict:
         print(f"Evaluating solution (live): {solution_text[:50]}...")
-        prompt = EVALUATION_PROMPT_TEMPLATE.format(problem=problem, solution_text=solution_text)
+
+        # 1. Base Prompt from the clean text file
+        base_prompt = EVALUATION_PROMPT_TEMPLATE.format(problem=problem, solution_text=solution_text)
+
+        # 2. INJECT STRONG INSTRUCTION
+        # This prevents the model from regurgitating examples found in the prompt file
+        schema_instruction = """
+
+        [IMPORTANT SYSTEM INSTRUCTION]
+        Ignore any previous examples of JSON formatting in this prompt.
+        You MUST strictly follow the Output Schema provided below.
+
+        For EACH of the 5 metrics (Novelty, Usefulness_Feasibility, etc.), you must provide an object with TWO fields:
+        1. "score": An integer from 1 to 5.
+        2. "justification": A specific sentence explaining why you gave that score.
+
+        Do not output a list. Return a single JSON object describing the solution above.
+        """
+
+        final_prompt = base_prompt + schema_instruction
 
         try:
+            # --- ENFORCE SCHEMA ---
             response = await self.gemini_model.generate_content_async(
-                prompt,
+                final_prompt,
                 generation_config=genai.types.GenerationConfig(
-                    response_mime_type="application/json"
+                    response_mime_type="application/json",
+                    response_schema=EVALUATION_SCHEMA
                 )
             )
 
@@ -80,9 +146,12 @@ class AgentCalibrator:
         self.sponsor_llms = list(self.api_clients.keys())
         print(f"AgentCalibrator initialized with enabled clients: {self.sponsor_llms}")
 
+    # --- UPDATED: Return detailed results for logging ---
     async def calibrate_team(self, problem: str) -> Tuple[Dict[str, Any], List[str], List[Dict[str, Any]]]:
         print(f"Running LIVE calibration test for specialist team on {self.sponsor_llms}...")
         error_log = []
+        detailed_results = []  # To capture full calibration data
+
         if not self.sponsor_llms:
             raise Exception("AgentCalibrator cannot run: No LLM clients are configured.")
 
@@ -108,7 +177,8 @@ class AgentCalibrator:
             tasks.append(self.run_calibration_test(problem, role, llm_name, persona, test_problem))
 
         results = await asyncio.gather(*tasks)
-
+        detailed_results = results  # Store the full results here
+
         best_llms = {}
         role_metrics = config.CALIBRATION_CONFIG["role_metrics"]
 
@@ -152,7 +222,9 @@ class AgentCalibrator:
             "Monitor": {"persona": config.CALIBRATION_CONFIG["roles_to_test"]["Monitor"], "llm": best_llms["Monitor"]}
         }
         print(f"Calibration complete (live). Team plan: {team_plan}")
-        return team_plan, error_log, results
+
+        # Return 3 values: The plan, errors, and the full trace
+        return team_plan, error_log, detailed_results
 
     async def run_calibration_test(self, problem, role, llm_name, persona, test_problem):
         print(f"...Calibrating {role} on {llm_name}...")
 