dror44 committed
Commit 45a014d · Parent: 2ea847d

added qualifire to the mix

Files changed (6):
  1. README.md +1 -0
  2. data/history.csv +15 -1
  3. data/leaderboard.csv +5 -5
  4. requirements.txt +1 -0
  5. src/app.py +22 -0
  6. src/judge.py +182 -1
README.md CHANGED
@@ -50,6 +50,7 @@ EvalArena/
 ```
 OPENAI_API_KEY=your_key_here
 ANTHROPIC_API_KEY=your_key_here
+QUALIFIRE_API_KEY=your_qualifire_key_here
 ```
 
 ## Running
data/history.csv CHANGED
@@ -57,4 +57,18 @@ LABEL: GROUNDED
 CONFIDENCE: 100",deepseek-r1,DeepSeek R1,"Evaluation time: 7.51 seconds
 
 LABEL: GROUNDED
-CONFIDENCE: 95",none,,0.44117021560668945,7.508124828338623
+CONFIDENCE: 95",none,,0.4411702156066894,7.508124828338623
+2025-04-26T18:55:42.278812,All applications for the Community Development Grant must be submitted online through the city portal. The submission window opens on October 1st and closes promptly on November 15th at 5:00 PM Eastern Time. The application requires a 5-page detailed project proposal and a separate budget spreadsheet.,The application requires a 10-page detailed project proposal.,claude-3-opus-latest,Claude 3 Opus,"Evaluation time: 1.20 seconds
+
+LABEL: UNGROUNDED
+CONFIDENCE: 100",meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,"Evaluation time: 1.15 seconds
+
+LABEL: UNGROUNDED
+CONFIDENCE: 100",claude-3-opus-latest,,1.1986050605773926,1.1471669673919678
+2025-04-26T18:56:39.750327,A pilot telehealth program was launched to improve access to primary care services for patients in rural areas. Initial results show that approximately 60% of routine follow-up appointments can be effectively managed via telehealth. Certain complex cases and initial diagnoses still require in-person consultations.,"Implementing a new telehealth program will completely eliminate the need for in-person doctor visits for all patients, regardless of their medical condition or age.",claude-3-opus-latest,Claude 3 Opus,"Evaluation time: 1.37 seconds
+
+LABEL: UNGROUNDED
+CONFIDENCE: 95",claude-3-5-sonnet-latest,Claude 3.5 Sonnet,"Evaluation time: 36.14 seconds
+
+LABEL: UNGROUNDED
+CONFIDENCE: 95",claude-3-opus-latest,,1.3739988803863525,36.14444375038147
data/leaderboard.csv CHANGED
@@ -1,6 +1,6 @@
 judge_id,judge_name,elo_score,wins,losses,total_evaluations,organization,license
 gemma-2-27b-it,Gemma 2 27B,1723.9484210232677,25.0,1.0,26.0,Google,Open Source
-claude-3-opus-latest,Claude 3 Opus,1531.9661669788793,2.0,0.0,2.0,Anthropic,Proprietary
+claude-3-opus-latest,Claude 3 Opus,1559.6185917879666,4.0,0.0,4.0,Anthropic,Proprietary
 claude-3-5-haiku-latest,Claude 3.5 Haiku,1521.2089100627643,1.0,1.0,2.0,Anthropic,Proprietary
 mistral-7b-instruct-v0.1,Mistral (7B) Instruct v0.1,1516.736306793522,1.0,0.0,1.0,Mistral AI,Open Source
 qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1516.0,1.0,0.0,1.0,Alibaba,Open Source
@@ -8,24 +8,24 @@ claude-3-sonnet-20240229,Claude 3 Sonnet,1515.263693206478,1.0,0.0,1.0,Anthropic
 meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,1511.8243832068688,1.0,1.0,2.0,Meta,Open Source
 gpt-4.1,GPT-4.1,1502.1692789932397,1.0,1.0,2.0,OpenAI,Proprietary
 claude-3-haiku-20240307,Claude 3 Haiku,1501.6053648908744,3.0,3.0,6.0,Anthropic,Proprietary
+qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source
 judge4,PrecisionJudge,1500.0,0.0,0.0,0.0,Anthropic,Commercial
 judge3,GradeAssist,1500.0,0.0,0.0,0.0,Anthropic,Commercial
 judge2,CritiqueBot,1500.0,0.0,0.0,0.0,OpenAI,Commercial
 gemma-2-9b-it,Gemma 2 9B,1500.0,0.0,0.0,0.0,Google,Open Source
-qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source
-atla-selene,Atla Selene,1500.0,0.0,0.0,0.0,Atla,Proprietary
 judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial
 meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
 o3-mini, o3-mini,1500.0,0.0,0.0,0.0,OpenAI,Proprietary
 judge1,EvalGPT,1500.0,0.0,0.0,0.0,OpenAI,Commercial
-meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1499.263693206478,1.0,1.0,2.0,Meta,Open Source
+atla-selene,Atla Selene,1500.0,0.0,0.0,0.0,Atla,Proprietary
 meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1499.2598341210926,2.0,2.0,4.0,Meta,Open Source
 gpt-4-turbo,GPT-4 Turbo,1497.676800228027,1.0,2.0,3.0,OpenAI,Proprietary
 deepseek-v3,DeepSeek V3,1496.4838513726352,1.0,2.0,3.0,DeepSeek,Open Source
 deepseek-r1,DeepSeek R1,1495.8192027996802,0.0,1.0,1.0,DeepSeek,Open Source
-claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1484.0,0.0,1.0,1.0,Anthropic,Proprietary
+meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1484.765265966291,1.0,2.0,3.0,Meta,Open Source
 mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1481.2300851469852,0.0,4.0,4.0,Mistral AI,Open Source
 meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,1481.194128995395,1.0,2.0,3.0,Meta,Open Source
+claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1470.8460024310996,0.0,2.0,2.0,Anthropic,Proprietary
 gpt-4o,GPT-4o,1466.0577517475272,0.0,3.0,3.0,OpenAI,Proprietary
 qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,1412.6552679185854,21.0,25.0,46.0,Alibaba,Open Source
 gpt-3.5-turbo,GPT-3.5 Turbo,1318.2061729482512,0.0,21.0,21.0,OpenAI,Proprietary
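
For context, the rating movements in this diff are consistent with a standard Elo update with K = 32: Claude 3 Opus's two new wins (over Llama 3.1 405B and Claude 3.5 Sonnet, per data/history.csv above) reproduce the new scores. A quick sketch, assuming that formula (the leaderboard code itself is not shown in this commit):

```python
def elo_update(r_winner: float, r_loser: float, k: float = 32.0) -> tuple[float, float]:
    """Standard Elo update: the winner gains k * (1 - expected win probability)."""
    expected = 1.0 / (1.0 + 10.0 ** ((r_loser - r_winner) / 400.0))
    delta = k * (1.0 - expected)
    return r_winner + delta, r_loser - delta

# Claude 3 Opus beats Meta Llama 3.1 405B, then Claude 3.5 Sonnet
opus, llama_405b = elo_update(1531.9661669788793, 1499.263693206478)
opus, sonnet = elo_update(opus, 1484.0)
print(opus)        # ~1559.6186  (new leaderboard: 1559.6185917879666)
print(llama_405b)  # ~1484.7653  (new leaderboard: 1484.765265966291)
print(sonnet)      # ~1470.8460  (new leaderboard: 1470.8460024310996)
```
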
requirements.txt CHANGED
@@ -5,4 +5,5 @@ loguru>=0.7.0
 numpy>=1.24.0
 pandas>=2.0.0
 python-dotenv>=1.0.0
+qualifire>=0.6.7
 together>=0.1.5
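
With the new dependency in place, wiring the key from the README into a client follows the same pattern src/judge.py uses below; a minimal sketch, assuming the keys live in a local `.env` file (python-dotenv is already a dependency):

```python
import os

import qualifire
from dotenv import load_dotenv

# Load QUALIFIRE_API_KEY (plus the other keys) from .env, as in the README
load_dotenv()

# Same constructor call that src/judge.py uses in this commit
client = qualifire.client.Client(
    api_key=os.environ.get("QUALIFIRE_API_KEY", ""),
)
```
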
src/app.py CHANGED
@@ -184,6 +184,28 @@ def submit_example(
         gr.update(visible=False),
     )
 
+    # Format inputs for Qualifire evaluation
+    input_text, output_text = format_inputs_for_evaluation(
+        text_input,
+        claim_input,
+        single_text_input,
+        policy_input,
+        policy_output,
+        policy_assertion,
+        test_type,
+    )
+
+    # Get a single Qualifire evaluation to be shared by both judges
+    qualifire_result = judge_manager.evaluate_with_qualifire(
+        input_text,
+        output_text,
+        test_type,
+    )
+    logger.info("Completed Qualifire evaluation")
+
+    # Store the Qualifire result for both judges to use
+    judge_manager.shared_qualifire_result = qualifire_result
+
     # Show loading messages while evaluations are in progress
     status_text = "Evaluations starting... Both judges will evaluate in parallel."
     return (
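
The point of the block above is to hit the Qualifire API once per submitted example rather than once per judge; both judges then read the cached value. A condensed sketch of the flow (names as in the diff):

```python
# One Qualifire call per submission, shared by both judges:
result = judge_manager.evaluate_with_qualifire(input_text, output_text, test_type)
judge_manager.shared_qualifire_result = result

# Later, inside each judge's evaluation path in src/judge.py, the cached
# value is read back instead of re-calling the (potentially slow) API:
qualifire_result = judge_manager.shared_qualifire_result
```
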
src/judge.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import random
 import time
 from typing import Any, Dict, List, Tuple
@@ -5,6 +6,7 @@ from typing import Any, Dict, List, Tuple
 # Add litellm configuration to handle unsupported parameters
 import litellm
 import pandas as pd
+import qualifire
 from litellm import completion
 from loguru import logger
 from together import Together
@@ -21,6 +23,12 @@ class JudgeManager:
         self.judges = judges
         self.leaderboard_df = self._init_leaderboard()
         self.together_client = Together()
+        # Initialize Qualifire client with API key from environment variables
+        self.qualifire_client = qualifire.client.Client(
+            api_key=os.environ.get("QUALIFIRE_API_KEY", ""),
+        )
+        # Store shared Qualifire evaluation results
+        self.shared_qualifire_result = ""
 
     def _init_leaderboard(self) -> pd.DataFrame:
         """Initialize or load the leaderboard dataframe"""
@@ -87,11 +95,18 @@ class JudgeManager:
             # Start timing
             start_time = time.time()
 
+            # Use shared Qualifire result instead of calling for each judge
+            qualifire_result = self.shared_qualifire_result
+
             # Create appropriate system prompt based on test type
             system_prompt = self._get_system_prompt(test_type)
 
             # Format user message with input and output
-            user_message = self._create_user_message(input_text, output_text, test_type)
+            user_message = self._create_user_message(
+                input_text,
+                output_text,
+                test_type,
+            )
 
             # Set temperature based on model
             temperature = 0.2
@@ -130,6 +145,10 @@
             # Format the final evaluation with timing info
             evaluation = f"Evaluation time: {elapsed_time:.2f} seconds\n\n{parsed_evaluation}"
 
+            # Add Qualifire result if available
+            if qualifire_result:
+                evaluation += f"\n\nQualifire evaluation:\n{qualifire_result}"
+
             # Format the evaluation - store the judge info but don't display it yet
             anonymous_eval = evaluation
 
@@ -145,6 +164,7 @@
                 "elapsed_time": elapsed_time,
                 "input_text": input_text,
                 "output_text": output_text,
+                "qualifire_result": qualifire_result,
             }
 
         except Exception as e:
@@ -172,6 +192,7 @@
                 "elapsed_time": elapsed_time,
                 "input_text": input_text,
                 "output_text": output_text,
+                "qualifire_result": None,
             }
 
     def _create_user_message(self, input_text: str, output_text: str, test_type: str) -> str:
@@ -470,3 +491,163 @@ or
 
 LABEL: POOR_RESPONSE
 CONFIDENCE: 72"""
+
+    def evaluate_with_qualifire(
+        self,
+        input_text: str,
+        output_text: str,
+        test_type: str,
+    ) -> str:
+        """Call Qualifire API with appropriate parameters based on test type.
+        This is a standalone method to be called once per evaluation."""
+        try:
+            # Skip Qualifire if API key is not set
+            if not os.environ.get("QUALIFIRE_API_KEY"):
+                logger.warning(
+                    "QUALIFIRE_API_KEY not set, skipping Qualifire evaluation",
+                )
+                return ""
+
+            # Map test types to Qualifire parameters
+            prompt_injections = test_type == "prompt injections"
+            grounding_check = test_type == "grounding"
+            safety_check = test_type == "safety"
+
+            # Extract assertions if available (from policy test type)
+            assertions = []
+            if test_type == "policy":
+                # First try structured format
+                for line in input_text.split("\n"):
+                    if line.startswith("Assertion:"):
+                        assertion = line[len("Assertion:") :].strip()
+                        if assertion:
+                            assertions = [assertion]
+                            break
+
+                # If no assertion found, check for other formats
+                if not assertions and "Assertion:" in input_text:
+                    assertion_parts = input_text.split("Assertion:")
+                    if len(assertion_parts) > 1:
+                        assertions = [assertion_parts[1].strip()]
+
+                # Log what we found
+                if assertions:
+                    logger.info(f"Found policy assertion: {assertions[0]}")
+                else:
+                    logger.warning("No policy assertion found in input")
+
+            # Call Qualifire API
+            logger.info(f"Calling Qualifire with test_type={test_type}, assertions={assertions}")
+
+            # Debug logs to help diagnose issues
+            logger.debug(f"Qualifire input: {input_text[:100]}...")
+            logger.debug(f"Qualifire output: {output_text[:100]}...")
+
+            result = self.qualifire_client.evaluate(
+                input=input_text,
+                output=output_text,
+                prompt_injections=prompt_injections,
+                grounding_check=grounding_check,
+                assertions=assertions,
+                dangerous_content_check=safety_check,
+                sexual_content_check=safety_check,
+                harassment_check=safety_check,
+                hate_speech_check=safety_check,
+            )
+
+            # Log response structure to help with debugging
+            if isinstance(result, dict):
+                logger.info(f"Qualifire response keys: {result.keys()}")
+            else:
+                logger.info(f"Qualifire response type: {type(result)}")
+
+            # Format the result for display
+            formatted_result = self._format_qualifire_result(result)
+            return formatted_result
+
+        except Exception as e:
+            logger.error(f"Error in Qualifire evaluation: {str(e)}")
+            import traceback
+
+            logger.error(f"Traceback: {traceback.format_exc()}")
+            return f"Qualifire evaluation error: {str(e)}"
+
+    def _format_qualifire_result(self, result) -> str:
+        """Format Qualifire result for display based on EvaluationResponse structure"""
+        if not result:
+            return ""
+
+        formatted = []
+
+        logger.info(f"Qualifire result type: {type(result)}")
+
+        try:
+            # Add overall score if available
+            if isinstance(result, dict) and "score" in result:
+                formatted.append(f"Overall score: {result['score']}/100")
+
+            # Process each evaluation result item
+            if isinstance(result, dict) and "evaluationResults" in result:
+                eval_results = result["evaluationResults"]
+
+                if not eval_results:
+                    formatted.append("No specific evaluation results provided")
+                elif isinstance(eval_results, list):
+                    for eval_item in eval_results:
+                        if isinstance(eval_item, dict):
+                            # Add the evaluation type if available
+                            if "type" in eval_item:
+                                formatted.append(f"\n--- {eval_item['type'].upper()} EVALUATION ---")
+
+                            # Process results if available
+                            if "results" in eval_item and isinstance(eval_item["results"], list):
+                                for eval_result in eval_item["results"]:
+                                    if not isinstance(eval_result, dict):
+                                        continue
+
+                                    # Format the label
+                                    label = eval_result.get("label", "UNKNOWN")
+                                    name = eval_result.get("name", "Check")
+                                    formatted.append(f"- {name}: {label}")
+
+                                    # Add confidence score if available
+                                    if "confidence_score" in eval_result:
+                                        formatted.append(f" Confidence: {eval_result['confidence_score']}/100")
+
+                                    # Add reason if available
+                                    if "reason" in eval_result and eval_result["reason"]:
+                                        reason = str(eval_result["reason"]).replace("\n", " ")
+                                        if len(reason) > 100:
+                                            reason = reason[:97] + "..."
+                                        formatted.append(f" Reason: {reason}")
+
+                                    # Add quote if available
+                                    if "quote" in eval_result and eval_result["quote"]:
+                                        quote = str(eval_result["quote"])
+                                        if len(quote) > 50:
+                                            quote = quote[:47] + "..."
+                                        formatted.append(f' Quote: "{quote}"')
+                        else:
+                            # Handle unexpected item type
+                            formatted.append(f"Unexpected evaluation item format: {type(eval_item)}")
+                else:
+                    # Handle unexpected evaluationResults format
+                    formatted.append(f"Unexpected evaluationResults format: {type(eval_results)}")
+
+            # Add status if available
+            if isinstance(result, dict) and "status" in result:
+                formatted.append(f"\nStatus: {result['status']}")
+
+        except Exception as e:
+            # Catch any formatting errors and return a simplified result
+            logger.error(f"Error formatting Qualifire result: {str(e)}")
+            import json
+
+            try:
+                # Try to return raw result as JSON string
+                return f"Qualifire raw result: {json.dumps(result, indent=2)}"
+            except Exception:
+                # If JSON serialization fails, return string representation
+                return f"Qualifire result: {str(result)}"
+
+        return "\n".join(formatted)
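
To illustrate what `_format_qualifire_result` produces, here is a hypothetical payload shaped the way the code above expects (`score`, `evaluationResults`, `status`); the real Qualifire response may differ, and `judge_manager` is assumed to be an existing `JudgeManager` instance:

```python
# Hypothetical response dict, inferred from _format_qualifire_result above;
# the field values echo the grounding example in data/history.csv.
sample = {
    "score": 87,
    "status": "completed",
    "evaluationResults": [
        {
            "type": "grounding",
            "results": [
                {
                    "name": "Grounding check",
                    "label": "UNGROUNDED",
                    "confidence_score": 95,
                    "reason": "The output claims a 10-page proposal; the source says 5 pages.",
                    "quote": "a 10-page detailed project proposal",
                },
            ],
        },
    ],
}

print(judge_manager._format_qualifire_result(sample))
# Overall score: 87/100
#
# --- GROUNDING EVALUATION ---
# - Grounding check: UNGROUNDED
#  Confidence: 95/100
#  Reason: The output claims a 10-page proposal; the source says 5 pages.
#  Quote: "a 10-page detailed project proposal"
#
# Status: completed
```
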