Update mcp_servers.py
mcp_servers.py  CHANGED  (+82 -10)
@@ -1,4 +1,4 @@
-# mcp_servers.py (FIXED:
+# mcp_servers.py (FIXED: Prompt-Schema Alignment + Detailed Logging)
 import asyncio
 import json
 import re
@@ -12,10 +12,58 @@ from personas import PERSONAS_DATA
 
 EVALUATION_PROMPT_TEMPLATE = load_prompt(config.PROMPT_FILES["evaluator"])
 
+# --- DEFINING THE SCHEMA TO FORCE JUSTIFICATIONS ---
+# This forces the LLM to return a specific JSON structure with "score" and "justification"
+EVALUATION_SCHEMA = {
+    "type": "OBJECT",
+    "properties": {
+        "Novelty": {
+            "type": "OBJECT",
+            "properties": {
+                "score": {"type": "INTEGER"},
+                "justification": {"type": "STRING"}
+            },
+            "required": ["score", "justification"]
+        },
+        "Usefulness_Feasibility": {
+            "type": "OBJECT",
+            "properties": {
+                "score": {"type": "INTEGER"},
+                "justification": {"type": "STRING"}
+            },
+            "required": ["score", "justification"]
+        },
+        "Flexibility": {
+            "type": "OBJECT",
+            "properties": {
+                "score": {"type": "INTEGER"},
+                "justification": {"type": "STRING"}
+            },
+            "required": ["score", "justification"]
+        },
+        "Elaboration": {
+            "type": "OBJECT",
+            "properties": {
+                "score": {"type": "INTEGER"},
+                "justification": {"type": "STRING"}
+            },
+            "required": ["score", "justification"]
+        },
+        "Cultural_Appropriateness": {
+            "type": "OBJECT",
+            "properties": {
+                "score": {"type": "INTEGER"},
+                "justification": {"type": "STRING"}
+            },
+            "required": ["score", "justification"]
+        }
+    },
+    "required": ["Novelty", "Usefulness_Feasibility", "Flexibility", "Elaboration", "Cultural_Appropriateness"]
+}
+
 def extract_json(text: str) -> dict:
     """Robustly extracts JSON from text."""
     try:
-        # 1. Try simple block extraction first
         clean_text = text.strip()
         if "```json" in clean_text:
             clean_text = clean_text.split("```json")[1].split("```")[0].strip()
@@ -23,10 +71,7 @@ def extract_json(text: str) -> dict:
             clean_text = clean_text.split("```")[1].split("```")[0].strip()
         return json.loads(clean_text)
     except (json.JSONDecodeError, IndexError):
-        # 2. Fallback: Regex to find the outermost JSON object
         try:
-            # Looks for { ... } but matching closest brackets is hard with regex.
-            # This regex grabs from the first { to the last }
             match = re.search(r'(\{[\s\S]*\})', text)
             if match:
                 return json.loads(match.group(1))
@@ -44,13 +89,34 @@ class BusinessSolutionEvaluator:
 
     async def evaluate(self, problem: str, solution_text: str) -> dict:
         print(f"Evaluating solution (live): {solution_text[:50]}...")
-
+
+        # 1. Base Prompt from the clean text file
+        base_prompt = EVALUATION_PROMPT_TEMPLATE.format(problem=problem, solution_text=solution_text)
+
+        # 2. INJECT STRONG INSTRUCTION
+        # This prevents the model from regurgitating examples found in the prompt file
+        schema_instruction = """
+
+        [IMPORTANT SYSTEM INSTRUCTION]
+        Ignore any previous examples of JSON formatting in this prompt.
+        You MUST strictly follow the Output Schema provided below.
+
+        For EACH of the 5 metrics (Novelty, Usefulness_Feasibility, etc.), you must provide an object with TWO fields:
+        1. "score": An integer from 1 to 5.
+        2. "justification": A specific sentence explaining why you gave that score.
+
+        Do not output a list. Return a single JSON object describing the solution above.
+        """
+
+        final_prompt = base_prompt + schema_instruction
 
         try:
+            # --- ENFORCE SCHEMA ---
             response = await self.gemini_model.generate_content_async(
-
+                final_prompt,
                 generation_config=genai.types.GenerationConfig(
-                    response_mime_type="application/json"
+                    response_mime_type="application/json",
+                    response_schema=EVALUATION_SCHEMA
                 )
             )
 
@@ -80,9 +146,12 @@ class AgentCalibrator:
         self.sponsor_llms = list(self.api_clients.keys())
         print(f"AgentCalibrator initialized with enabled clients: {self.sponsor_llms}")
 
+    # --- UPDATED: Return detailed results for logging ---
    async def calibrate_team(self, problem: str) -> Tuple[Dict[str, Any], List[str], List[Dict[str, Any]]]:
         print(f"Running LIVE calibration test for specialist team on {self.sponsor_llms}...")
         error_log = []
+        detailed_results = [] # To capture full calibration data
+
         if not self.sponsor_llms:
             raise Exception("AgentCalibrator cannot run: No LLM clients are configured.")
 
@@ -108,7 +177,8 @@ class AgentCalibrator:
             tasks.append(self.run_calibration_test(problem, role, llm_name, persona, test_problem))
 
         results = await asyncio.gather(*tasks)
-
+        detailed_results = results # Store the full results here
+
         best_llms = {}
         role_metrics = config.CALIBRATION_CONFIG["role_metrics"]
 
@@ -152,7 +222,9 @@ class AgentCalibrator:
             "Monitor": {"persona": config.CALIBRATION_CONFIG["roles_to_test"]["Monitor"], "llm": best_llms["Monitor"]}
         }
         print(f"Calibration complete (live). Team plan: {team_plan}")
-
+
+        # Return 3 values: The plan, errors, and the full trace
+        return team_plan, error_log, detailed_results
 
     async def run_calibration_test(self, problem, role, llm_name, persona, test_problem):
         print(f"...Calibrating {role} on {llm_name}...")
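
Note on the retained extract_json fallback: when the model wraps its JSON in prose instead of a fenced block, the regex path grabs everything from the first "{" to the last "}". A quick sketch of what that handles (the wrapper text is illustrative only):

raw = 'Sure! Here is the evaluation:\n{"Novelty": {"score": 4, "justification": "ok"}}'
# Direct json.loads fails because of the leading prose, so the regex fallback kicks in.
print(extract_json(raw)["Novelty"]["score"])  # -> 4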
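With response_schema enforced, each of the five metrics should come back as a {score, justification} object rather than a bare number. A minimal sketch of the expected payload shape and of flattening it for scoring; the flatten_scores helper and the sample values are hypothetical, not part of this commit:

import json

# Illustrative only: a response body that satisfies EVALUATION_SCHEMA
# (scores and justifications are made-up placeholder values).
sample_response_text = """{
  "Novelty": {"score": 4, "justification": "Combines two known tactics in an uncommon way."},
  "Usefulness_Feasibility": {"score": 3, "justification": "Workable, but needs a partner integration."},
  "Flexibility": {"score": 4, "justification": "Applies across several customer segments."},
  "Elaboration": {"score": 2, "justification": "Implementation steps are only sketched."},
  "Cultural_Appropriateness": {"score": 5, "justification": "Fits local purchasing norms."}
}"""

def flatten_scores(payload_text: str) -> dict:
    """Hypothetical helper (not in mcp_servers.py): pull plain scores out of the nested payload."""
    data = json.loads(payload_text)  # extract_json(payload_text) would behave the same here
    return {metric: block["score"] for metric, block in data.items()}

print(flatten_scores(sample_response_text))
# -> {'Novelty': 4, 'Usefulness_Feasibility': 3, 'Flexibility': 4, 'Elaboration': 2, 'Cultural_Appropriateness': 5}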
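Because calibrate_team now returns three values (the team plan, the error log, and the full calibration trace), any caller written against the old return shape needs to unpack one more item. A minimal sketch of a call site under that assumption; the build_team function and the example problem string are hypothetical, not part of this commit:

import asyncio

async def build_team(calibrator, problem: str):
    # calibrator is an AgentCalibrator instance; the orchestration around it is assumed.
    team_plan, error_log, detailed_results = await calibrator.calibrate_team(problem)
    for entry in detailed_results:   # full per-role/per-LLM calibration trace
        print("calibration trace:", entry)
    for err in error_log:            # any failures captured during calibration
        print("calibration error:", err)
    return team_plan

# Example usage (hypothetical):
# team_plan = asyncio.run(build_team(AgentCalibrator(...), "How can a cafe grow weekday sales?"))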