Update mcp_servers.py
mcp_servers.py  CHANGED  (+64, -41)
@@ -1,4 +1,4 @@
-# mcp_servers.py (FIXED:
+# mcp_servers.py (FIXED: Schema Enforcement + Detailed Logging + Usage Tracking)
 import asyncio
 import json
 import re
@@ -13,7 +13,6 @@ from personas import PERSONAS_DATA
 EVALUATION_PROMPT_TEMPLATE = load_prompt(config.PROMPT_FILES["evaluator"])

 # --- DEFINING THE SCHEMA TO FORCE JUSTIFICATIONS ---
-# This forces the LLM to return a specific JSON structure with "score" and "justification"
 EVALUATION_SCHEMA = {
     "type": "OBJECT",
     "properties": {
@@ -87,16 +86,13 @@ class BusinessSolutionEvaluator:
         if "ERROR:" in EVALUATION_PROMPT_TEMPLATE:
             raise FileNotFoundError(EVALUATION_PROMPT_TEMPLATE)

-    async def evaluate(self, problem: str, solution_text: str) -> dict:
+    async def evaluate(self, problem: str, solution_text: str) -> Tuple[dict, dict]:
+        """Returns (evaluation_dict, usage_dict)"""
         print(f"Evaluating solution (live): {solution_text[:50]}...")

-        # 1. Base Prompt from the clean text file
         base_prompt = EVALUATION_PROMPT_TEMPLATE.format(problem=problem, solution_text=solution_text)

-        # 2. INJECT STRONG INSTRUCTION
-        # This prevents the model from regurgitating examples found in the prompt file
         schema_instruction = """
-
 [IMPORTANT SYSTEM INSTRUCTION]
 Ignore any previous examples of JSON formatting in this prompt.
 You MUST strictly follow the Output Schema provided below.
@@ -109,9 +105,9 @@ class BusinessSolutionEvaluator:
 """

         final_prompt = base_prompt + schema_instruction
+        usage = {"model": "Gemini", "input": 0, "output": 0}

         try:
-            # --- ENFORCE SCHEMA ---
             response = await self.gemini_model.generate_content_async(
                 final_prompt,
                 generation_config=genai.types.GenerationConfig(
@@ -120,14 +116,18 @@ class BusinessSolutionEvaluator:
                 )
             )

+            # Capture Usage
+            if hasattr(response, "usage_metadata"):
+                usage["input"] = response.usage_metadata.prompt_token_count
+                usage["output"] = response.usage_metadata.candidates_token_count
+
             v_fitness = extract_json(response.text)

-            # Strict Type Checking
             if not isinstance(v_fitness, (dict, list)):
                 raise ValueError(f"Judge returned invalid type: {type(v_fitness)}")

             print(f"Evaluation complete (live): {v_fitness}")
-            return v_fitness
+            return v_fitness, usage

         except Exception as e:
             print(f"ERROR: BusinessSolutionEvaluator failed: {e}")
@@ -137,7 +137,7 @@ class BusinessSolutionEvaluator:
                 "Flexibility": {"score": 1, "justification": f"Error: {str(e)}"},
                 "Elaboration": {"score": 1, "justification": f"Error: {str(e)}"},
                 "Cultural_Appropriateness": {"score": 1, "justification": f"Error: {str(e)}"}
-            }
+            }, usage

 class AgentCalibrator:
     def __init__(self, api_clients: dict, evaluator: BusinessSolutionEvaluator):
@@ -146,11 +146,11 @@ class AgentCalibrator:
         self.sponsor_llms = list(self.api_clients.keys())
         print(f"AgentCalibrator initialized with enabled clients: {self.sponsor_llms}")

-
-    async def calibrate_team(self, problem: str) -> Tuple[Dict[str, Any], List[str], List[Dict[str, Any]]]:
+    async def calibrate_team(self, problem: str) -> Tuple[Dict[str, Any], List[str], List[Dict[str, Any]], List[Dict[str, Any]]]:
         print(f"Running LIVE calibration test for specialist team on {self.sponsor_llms}...")
         error_log = []
-        detailed_results = []
+        detailed_results = []
+        all_usage_stats = [] # Collect all usage data here

         if not self.sponsor_llms:
             raise Exception("AgentCalibrator cannot run: No LLM clients are configured.")
@@ -163,7 +163,7 @@ class AgentCalibrator:
                 "Implementer": {"persona": config.CALIBRATION_CONFIG["roles_to_test"]["Implementer"], "llm": default_llm},
                 "Monitor": {"persona": config.CALIBRATION_CONFIG["roles_to_test"]["Monitor"], "llm": default_llm}
             }
-            return plan, error_log, []
+            return plan, error_log, [], []

         roles_to_test = {
             role: PERSONAS_DATA[key]["description"]
@@ -177,7 +177,12 @@ class AgentCalibrator:
                 tasks.append(self.run_calibration_test(problem, role, llm_name, persona, test_problem))

         results = await asyncio.gather(*tasks)
-        detailed_results = results
+        detailed_results = results
+
+        # Flatten results to extract usage
+        for res in results:
+            if "usage_gen" in res: all_usage_stats.append(res["usage_gen"])
+            if "usage_eval" in res: all_usage_stats.append(res["usage_eval"])

         best_llms = {}
         role_metrics = config.CALIBRATION_CONFIG["role_metrics"]
@@ -194,20 +199,12 @@ class AgentCalibrator:

                 # Robust Dict Access
                 raw_score_data = res.get("score", {})
-
-                if not isinstance(raw_score_data, (dict, list)):
-                    raw_score_data = {}
-
-                if isinstance(raw_score_data, list):
-                    raw_score_data = raw_score_data[0] if len(raw_score_data) > 0 else {}
+                if not isinstance(raw_score_data, (dict, list)): raw_score_data = {}
+                if isinstance(raw_score_data, list): raw_score_data = raw_score_data[0] if len(raw_score_data) > 0 else {}

                 metric_data = raw_score_data.get(metric, {})
-
-                if not isinstance(metric_data, (dict, list)):
-                    metric_data = {}
-
-                if isinstance(metric_data, list):
-                    metric_data = metric_data[0] if len(metric_data) > 0 else {}
+                if not isinstance(metric_data, (dict, list)): metric_data = {}
+                if isinstance(metric_data, list): metric_data = metric_data[0] if len(metric_data) > 0 else {}

                 score = metric_data.get("score", 0)

@@ -222,24 +219,34 @@ class AgentCalibrator:
             "Monitor": {"persona": config.CALIBRATION_CONFIG["roles_to_test"]["Monitor"], "llm": best_llms["Monitor"]}
         }
         print(f"Calibration complete (live). Team plan: {team_plan}")
-
-        # Return 3 values: The plan, errors, and the full trace
-        return team_plan, error_log, detailed_results
+        return team_plan, error_log, detailed_results, all_usage_stats

     async def run_calibration_test(self, problem, role, llm_name, persona, test_problem):
         print(f"...Calibrating {role} on {llm_name}...")
         client = self.api_clients[llm_name]
-        solution = await get_llm_response(llm_name, client, persona, test_problem)
+
+        # 1. Generate Solution (and get usage)
+        solution, gen_usage = await get_llm_response(llm_name, client, persona, test_problem)

         if "Error generating response" in solution:
-            return {"role": role, "llm": llm_name, "error": solution, "output": solution}
+            return {"role": role, "llm": llm_name, "error": solution, "output": solution, "usage_gen": gen_usage}
+
+        # 2. Evaluate Solution (and get usage)
+        score, eval_usage = await self.evaluator.evaluate(problem, solution)

-        score = await self.evaluator.evaluate(problem, solution)
-        return {"role": role, "llm": llm_name, "score": score, "output": solution}
+        return {
+            "role": role,
+            "llm": llm_name,
+            "score": score,
+            "output": solution,
+            "usage_gen": gen_usage,
+            "usage_eval": eval_usage
+        }

 # --- Unified API Call Function ---
-async def get_llm_response(client_name: str, client, system_prompt: str, user_prompt: str) -> str:
-    """
+async def get_llm_response(client_name: str, client, system_prompt: str, user_prompt: str) -> Tuple[str, dict]:
+    """Returns (text_response, usage_dict)"""
+    usage = {"model": client_name, "input": 0, "output": 0}
     try:
         if client_name == "Gemini":
             model = client
@@ -249,7 +256,13 @@ async def get_llm_response(client_name: str, client, system_prompt: str, user_pr
                 {'role': 'user', 'parts': [user_prompt]}
             ]
             response = await model.generate_content_async(full_prompt)
-            return response.text
+
+            # Capture Gemini Usage
+            if hasattr(response, "usage_metadata"):
+                usage["input"] = response.usage_metadata.prompt_token_count
+                usage["output"] = response.usage_metadata.candidates_token_count
+
+            return response.text, usage

         elif client_name == "Anthropic":
             response = await client.messages.create(
@@ -258,7 +271,12 @@ async def get_llm_response(client_name: str, client, system_prompt: str, user_pr
                 system=system_prompt,
                 messages=[{"role": "user", "content": user_prompt}]
             )
-            return response.content[0].text
+            # Capture Anthropic Usage
+            if hasattr(response, "usage"):
+                usage["input"] = response.usage.input_tokens
+                usage["output"] = response.usage.output_tokens
+
+            return response.content[0].text, usage

         elif client_name == "SambaNova":
             completion = await client.chat.completions.create(
@@ -268,9 +286,14 @@ async def get_llm_response(client_name: str, client, system_prompt: str, user_pr
                     {"role": "user", "content": user_prompt}
                 ]
             )
-            return completion.choices[0].message.content
+            # Capture SambaNova Usage
+            if hasattr(completion, "usage"):
+                usage["input"] = completion.usage.prompt_tokens
+                usage["output"] = completion.usage.completion_tokens
+
+            return completion.choices[0].message.content, usage

     except Exception as e:
         error_message = f"Error generating response from {client_name}: {str(e)}"
         print(f"ERROR: API call to {client_name} failed: {e}")
-        return error_message
+        return error_message, usage
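
Every usage dict introduced in this commit has the shape {"model": ..., "input": <prompt tokens>, "output": <completion tokens>}, and calibrate_team() now returns them collected in all_usage_stats. As a rough illustration of how a caller might consume that list, here is a minimal sketch; summarize_usage and the commented example are hypothetical helpers, not part of this commit.

from collections import defaultdict
from typing import Dict, List

def summarize_usage(all_usage_stats: List[dict]) -> Dict[str, dict]:
    # Roll up the per-call usage dicts ({"model", "input", "output"}) into per-model totals.
    totals: Dict[str, dict] = defaultdict(lambda: {"input": 0, "output": 0, "calls": 0})
    for entry in all_usage_stats:
        model = entry.get("model", "unknown")
        totals[model]["input"] += entry.get("input", 0)
        totals[model]["output"] += entry.get("output", 0)
        totals[model]["calls"] += 1
    return dict(totals)

# Hypothetical usage, assuming a calibrator instance:
# team_plan, error_log, detailed_results, all_usage_stats = await calibrator.calibrate_team(problem)
# print(summarize_usage(all_usage_stats))

Keeping the aggregation outside the calibrator mirrors the commit's design: the API layer only records raw token counts, and any reporting or cost accounting can be layered on top of the returned list.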