added qualifire to the mix
Files changed:

- README.md (+1 -0)
- data/history.csv (+15 -1)
- data/leaderboard.csv (+5 -5)
- requirements.txt (+1 -0)
- src/app.py (+22 -0)
- src/judge.py (+182 -1)
README.md
@@ -50,6 +50,7 @@ EvalArena/
 ```
 OPENAI_API_KEY=your_key_here
 ANTHROPIC_API_KEY=your_key_here
+QUALIFIRE_API_KEY=your_qualifire_key_here
 ```

 ## Running
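The new key is read from the environment in src/judge.py via `os.environ.get("QUALIFIRE_API_KEY", "")`. Below is a minimal sketch of loading these keys at startup, assuming they live in a local `.env` file and are loaded with `python-dotenv` (already listed in requirements.txt); the actual startup wiring is not part of this diff:

```python
# Sketch: load API keys from a local .env file (assumed location) before starting the app.
import os

from dotenv import load_dotenv

load_dotenv()  # reads OPENAI_API_KEY, ANTHROPIC_API_KEY, QUALIFIRE_API_KEY into os.environ

if not os.environ.get("QUALIFIRE_API_KEY"):
    # src/judge.py skips the Qualifire call when this key is missing
    print("QUALIFIRE_API_KEY not set; Qualifire evaluation will be skipped")
```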
data/history.csv
@@ -57,4 +57,18 @@ LABEL: GROUNDED
 CONFIDENCE: 100",deepseek-r1,DeepSeek R1,"Evaluation time: 7.51 seconds

 LABEL: GROUNDED
-CONFIDENCE: 95",none,,0.
+CONFIDENCE: 95",none,,0.4411702156066894,7.508124828338623
+2025-04-26T18:55:42.278812,All applications for the Community Development Grant must be submitted online through the city portal. The submission window opens on October 1st and closes promptly on November 15th at 5:00 PM Eastern Time. The application requires a 5-page detailed project proposal and a separate budget spreadsheet.,The application requires a 10-page detailed project proposal.,claude-3-opus-latest,Claude 3 Opus,"Evaluation time: 1.20 seconds
+
+LABEL: UNGROUNDED
+CONFIDENCE: 100",meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,"Evaluation time: 1.15 seconds
+
+LABEL: UNGROUNDED
+CONFIDENCE: 100",claude-3-opus-latest,,1.1986050605773926,1.1471669673919678
+2025-04-26T18:56:39.750327,A pilot telehealth program was launched to improve access to primary care services for patients in rural areas. Initial results show that approximately 60% of routine follow-up appointments can be effectively managed via telehealth. Certain complex cases and initial diagnoses still require in-person consultations.,"Implementing a new telehealth program will completely eliminate the need for in-person doctor visits for all patients, regardless of their medical condition or age.",claude-3-opus-latest,Claude 3 Opus,"Evaluation time: 1.37 seconds
+
+LABEL: UNGROUNDED
+CONFIDENCE: 95",claude-3-5-sonnet-latest,Claude 3.5 Sonnet,"Evaluation time: 36.14 seconds
+
+LABEL: UNGROUNDED
+CONFIDENCE: 95",claude-3-opus-latest,,1.3739988803863525,36.14444375038147
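The appended rows quote the judge evaluations, so a single logical record spans several physical lines (the embedded newlines are why `LABEL:` and `CONFIDENCE:` appear on their own lines above). Standard CSV parsers handle this; a small sketch with pandas (already a dependency), with no column names assumed since the header row is not visible in this hunk:

```python
# Sketch: quoted multi-line fields in history.csv parse as single rows.
import pandas as pd

history = pd.read_csv("data/history.csv")
print(len(history))       # grows by the two battles appended in this commit
print(history.iloc[-1])   # the telehealth example, evaluations as multi-line strings
```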
data/leaderboard.csv
@@ -1,6 +1,6 @@
 judge_id,judge_name,elo_score,wins,losses,total_evaluations,organization,license
 gemma-2-27b-it,Gemma 2 27B,1723.9484210232677,25.0,1.0,26.0,Google,Open Source
-claude-3-opus-latest,Claude 3 Opus,
+claude-3-opus-latest,Claude 3 Opus,1559.6185917879666,4.0,0.0,4.0,Anthropic,Proprietary
 claude-3-5-haiku-latest,Claude 3.5 Haiku,1521.2089100627643,1.0,1.0,2.0,Anthropic,Proprietary
 mistral-7b-instruct-v0.1,Mistral (7B) Instruct v0.1,1516.736306793522,1.0,0.0,1.0,Mistral AI,Open Source
 qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1516.0,1.0,0.0,1.0,Alibaba,Open Source
@@ -8,24 +8,24 @@ claude-3-sonnet-20240229,Claude 3 Sonnet,1515.263693206478,1.0,0.0,1.0,Anthropic
 meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,1511.8243832068688,1.0,1.0,2.0,Meta,Open Source
 gpt-4.1,GPT-4.1,1502.1692789932397,1.0,1.0,2.0,OpenAI,Proprietary
 claude-3-haiku-20240307,Claude 3 Haiku,1501.6053648908744,3.0,3.0,6.0,Anthropic,Proprietary
+qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source
 judge4,PrecisionJudge,1500.0,0.0,0.0,0.0,Anthropic,Commercial
 judge3,GradeAssist,1500.0,0.0,0.0,0.0,Anthropic,Commercial
 judge2,CritiqueBot,1500.0,0.0,0.0,0.0,OpenAI,Commercial
 gemma-2-9b-it,Gemma 2 9B,1500.0,0.0,0.0,0.0,Google,Open Source
-qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source
-atla-selene,Atla Selene,1500.0,0.0,0.0,0.0,Atla,Proprietary
 judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial
 meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
 o3-mini, o3-mini,1500.0,0.0,0.0,0.0,OpenAI,Proprietary
 judge1,EvalGPT,1500.0,0.0,0.0,0.0,OpenAI,Commercial
-
+atla-selene,Atla Selene,1500.0,0.0,0.0,0.0,Atla,Proprietary
 meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1499.2598341210926,2.0,2.0,4.0,Meta,Open Source
 gpt-4-turbo,GPT-4 Turbo,1497.676800228027,1.0,2.0,3.0,OpenAI,Proprietary
 deepseek-v3,DeepSeek V3,1496.4838513726352,1.0,2.0,3.0,DeepSeek,Open Source
 deepseek-r1,DeepSeek R1,1495.8192027996802,0.0,1.0,1.0,DeepSeek,Open Source
-
+meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1484.765265966291,1.0,2.0,3.0,Meta,Open Source
 mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1481.2300851469852,0.0,4.0,4.0,Mistral AI,Open Source
 meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,1481.194128995395,1.0,2.0,3.0,Meta,Open Source
+claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1470.8460024310996,0.0,2.0,2.0,Anthropic,Proprietary
 gpt-4o,GPT-4o,1466.0577517475272,0.0,3.0,3.0,OpenAI,Proprietary
 qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,1412.6552679185854,21.0,25.0,46.0,Alibaba,Open Source
 gpt-3.5-turbo,GPT-3.5 Turbo,1318.2061729482512,0.0,21.0,21.0,OpenAI,Proprietary
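The leaderboard columns are judge_id, judge_name, elo_score, wins, losses, total_evaluations, organization, and license; unrated judges sit at 1500.0 and scores move after each head-to-head battle. The repo's actual rating code is not part of this diff, so the following is only a generic Elo update sketch with an assumed K-factor of 32 (which happens to be consistent with qwen-2.5-7b-instruct-turbo landing on 1516.0 after a single win against a 1500-rated opponent):

```python
# Sketch: generic Elo update matching the leaderboard columns; K=32 is an assumption.
def elo_update(winner: float, loser: float, k: float = 32.0) -> tuple[float, float]:
    expected = 1.0 / (1.0 + 10 ** ((loser - winner) / 400.0))  # winner's expected score
    return winner + k * (1.0 - expected), loser - k * (1.0 - expected)

print(elo_update(1500.0, 1500.0))  # (1516.0, 1484.0)
```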
requirements.txt
@@ -5,4 +5,5 @@ loguru>=0.7.0
 numpy>=1.24.0
 pandas>=2.0.0
 python-dotenv>=1.0.0
+qualifire>=0.6.7
 together>=0.1.5
src/app.py
@@ -184,6 +184,28 @@ def submit_example(
         gr.update(visible=False),
     )

+    # Format inputs for Qualifire evaluation
+    input_text, output_text = format_inputs_for_evaluation(
+        text_input,
+        claim_input,
+        single_text_input,
+        policy_input,
+        policy_output,
+        policy_assertion,
+        test_type,
+    )
+
+    # Get a single Qualifire evaluation to be shared by both judges
+    qualifire_result = judge_manager.evaluate_with_qualifire(
+        input_text,
+        output_text,
+        test_type,
+    )
+    logger.info("Completed Qualifire evaluation")
+
+    # Store the Qualifire result for both judges to use
+    judge_manager.shared_qualifire_result = qualifire_result
+
     # Show loading messages while evaluations are in progress
     status_text = "Evaluations starting... Both judges will evaluate in parallel."
     return (
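This change calls Qualifire once per submitted example and stashes the result on the judge manager, so both anonymous judges reuse the same verdict instead of each triggering a separate API call. Below is a standalone sketch of that sharing pattern with the Gradio wiring and the real API call stubbed out; the class and attribute names mirror the diff, while `judge_evaluation` is only a stand-in for the judges' evaluation path:

```python
# Sketch: one Qualifire call per example, shared by both judges via the manager.
class JudgeManager:
    def __init__(self) -> None:
        self.shared_qualifire_result = ""

    def evaluate_with_qualifire(self, input_text: str, output_text: str, test_type: str) -> str:
        # Stub standing in for the real Qualifire API call in src/judge.py.
        return f"[qualifire verdict for test_type={test_type!r}]"

    def judge_evaluation(self, judge_id: str) -> str:
        evaluation = f"{judge_id}: LABEL/CONFIDENCE from the LLM judge"
        if self.shared_qualifire_result:
            # Same appended block as judge.py's "Add Qualifire result if available".
            evaluation += f"\n\nQualifire evaluation:\n{self.shared_qualifire_result}"
        return evaluation


manager = JudgeManager()
manager.shared_qualifire_result = manager.evaluate_with_qualifire("context", "claim", "grounding")
for judge in ("judge_a", "judge_b"):
    print(manager.judge_evaluation(judge))
```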
src/judge.py
@@ -1,3 +1,4 @@
+import os
 import random
 import time
 from typing import Any, Dict, List, Tuple
@@ -5,6 +6,7 @@ from typing import Any, Dict, List, Tuple
 # Add litellm configuration to handle unsupported parameters
 import litellm
 import pandas as pd
+import qualifire
 from litellm import completion
 from loguru import logger
 from together import Together
@@ -21,6 +23,12 @@ class JudgeManager:
         self.judges = judges
         self.leaderboard_df = self._init_leaderboard()
         self.together_client = Together()
+        # Initialize Qualifire client with API key from environment variables
+        self.qualifire_client = qualifire.client.Client(
+            api_key=os.environ.get("QUALIFIRE_API_KEY", ""),
+        )
+        # Store shared Qualifire evaluation results
+        self.shared_qualifire_result = ""

     def _init_leaderboard(self) -> pd.DataFrame:
         """Initialize or load the leaderboard dataframe"""
@@ -87,11 +95,18 @@
             # Start timing
             start_time = time.time()

+            # Use shared Qualifire result instead of calling for each judge
+            qualifire_result = self.shared_qualifire_result
+
             # Create appropriate system prompt based on test type
             system_prompt = self._get_system_prompt(test_type)

             # Format user message with input and output
-            user_message = self._create_user_message(
+            user_message = self._create_user_message(
+                input_text,
+                output_text,
+                test_type,
+            )

             # Set temperature based on model
             temperature = 0.2
@@ -130,6 +145,10 @@
             # Format the final evaluation with timing info
             evaluation = f"Evaluation time: {elapsed_time:.2f} seconds\n\n{parsed_evaluation}"

+            # Add Qualifire result if available
+            if qualifire_result:
+                evaluation += f"\n\nQualifire evaluation:\n{qualifire_result}"
+
             # Format the evaluation - store the judge info but don't display it yet
             anonymous_eval = evaluation

@@ -145,6 +164,7 @@
                 "elapsed_time": elapsed_time,
                 "input_text": input_text,
                 "output_text": output_text,
+                "qualifire_result": qualifire_result,
             }

         except Exception as e:
@@ -172,6 +192,7 @@
                 "elapsed_time": elapsed_time,
                 "input_text": input_text,
                 "output_text": output_text,
+                "qualifire_result": None,
             }

     def _create_user_message(self, input_text: str, output_text: str, test_type: str) -> str:
@@ -470,3 +491,163 @@ or

 LABEL: POOR_RESPONSE
 CONFIDENCE: 72"""
+
+    def evaluate_with_qualifire(
+        self,
+        input_text: str,
+        output_text: str,
+        test_type: str,
+    ) -> str:
+        """Call Qualifire API with appropriate parameters based on test type.
+        This is a standalone method to be called once per evaluation."""
+        try:
+            # Skip Qualifire if API key is not set
+            if not os.environ.get("QUALIFIRE_API_KEY"):
+                logger.warning(
+                    "QUALIFIRE_API_KEY not set, skipping Qualifire evaluation",
+                )
+                return ""
+
+            # Map test types to Qualifire parameters
+            prompt_injections = test_type == "prompt injections"
+            grounding_check = test_type == "grounding"
+            safety_check = test_type == "safety"
+
+            # Extract assertions if available (from policy test type)
+            assertions = []
+            if test_type == "policy":
+                # First try structured format
+                for line in input_text.split("\n"):
+                    if line.startswith("Assertion:"):
+                        assertion = line[len("Assertion:") :].strip()
+                        if assertion:
+                            assertions = [assertion]
+                            break
+
+                # If no assertion found, check for other formats
+                if not assertions and "Assertion:" in input_text:
+                    assertion_parts = input_text.split("Assertion:")
+                    if len(assertion_parts) > 1:
+                        assertions = [assertion_parts[1].strip()]
+
+                # Log what we found
+                if assertions:
+                    logger.info(f"Found policy assertion: {assertions[0]}")
+                else:
+                    logger.warning("No policy assertion found in input")
+
+            # Call Qualifire API
+            logger.info(f"Calling Qualifire with test_type={test_type}, assertions={assertions}")
+
+            # Debug logs to help diagnose issues
+            logger.debug(f"Qualifire input: {input_text[:100]}...")
+            logger.debug(f"Qualifire output: {output_text[:100]}...")
+
+            result = self.qualifire_client.evaluate(
+                input=input_text,
+                output=output_text,
+                prompt_injections=prompt_injections,
+                grounding_check=grounding_check,
+                assertions=assertions,
+                dangerous_content_check=safety_check,
+                sexual_content_check=safety_check,
+                harassment_check=safety_check,
+                hate_speech_check=safety_check,
+            )
+
+            # Log response structure to help with debugging
+            if isinstance(result, dict):
+                logger.info(f"Qualifire response keys: {result.keys()}")
+            else:
+                logger.info(f"Qualifire response type: {type(result)}")
+
+            # Format the result for display
+            formatted_result = self._format_qualifire_result(result)
+            return formatted_result
+
+        except Exception as e:
+            logger.error(f"Error in Qualifire evaluation: {str(e)}")
+            import traceback
+
+            logger.error(f"Traceback: {traceback.format_exc()}")
+            return f"Qualifire evaluation error: {str(e)}"
+
+    def _format_qualifire_result(self, result) -> str:
+        """Format Qualifire result for display based on EvaluationResponse structure"""
+        if not result:
+            return ""
+
+        formatted = []
+
+        logger.info(f"Qualifire result type: {type(result)}")
+
+        try:
+            # Add overall score if available
+            if isinstance(result, dict) and "score" in result:
+                formatted.append(f"Overall score: {result['score']}/100")
+
+            # Process each evaluation result item
+            if isinstance(result, dict) and "evaluationResults" in result:
+                eval_results = result["evaluationResults"]
+
+                if not eval_results:
+                    formatted.append("No specific evaluation results provided")
+                elif isinstance(eval_results, list):
+                    for eval_item in eval_results:
+                        if isinstance(eval_item, dict):
+                            # Add the evaluation type if available
+                            if "type" in eval_item:
+                                formatted.append(f"\n--- {eval_item['type'].upper()} EVALUATION ---")
+
+                            # Process results if available
+                            if "results" in eval_item and isinstance(eval_item["results"], list):
+                                for eval_result in eval_item["results"]:
+                                    if not isinstance(eval_result, dict):
+                                        continue
+
+                                    # Format the label
+                                    label = eval_result.get("label", "UNKNOWN")
+                                    name = eval_result.get("name", "Check")
+                                    formatted.append(f"- {name}: {label}")
+
+                                    # Add confidence score if available
+                                    if "confidence_score" in eval_result:
+                                        formatted.append(f" Confidence: {eval_result['confidence_score']}/100")
+
+                                    # Add reason if available
+                                    if "reason" in eval_result and eval_result["reason"]:
+                                        reason = str(eval_result["reason"]).replace("\n", " ")
+                                        if len(reason) > 100:
+                                            reason = reason[:97] + "..."
+                                        formatted.append(f" Reason: {reason}")
+
+                                    # Add quote if available
+                                    if "quote" in eval_result and eval_result["quote"]:
+                                        quote = str(eval_result["quote"])
+                                        if len(quote) > 50:
+                                            quote = quote[:47] + "..."
+                                        formatted.append(f' Quote: "{quote}"')
+                        else:
+                            # Handle unexpected item type
+                            formatted.append(f"Unexpected evaluation item format: {type(eval_item)}")
+                else:
+                    # Handle unexpected evaluationResults format
+                    formatted.append(f"Unexpected evaluationResults format: {type(eval_results)}")
+
+            # Add status if available
+            if isinstance(result, dict) and "status" in result:
+                formatted.append(f"\nStatus: {result['status']}")
+
+        except Exception as e:
+            # Catch any formatting errors and return a simplified result
+            logger.error(f"Error formatting Qualifire result: {str(e)}")
+            import json
+
+            try:
+                # Try to return raw result as JSON string
+                return f"Qualifire raw result: {json.dumps(result, indent=2)}"
+            except Exception:
+                # If JSON serialization fails, return string representation
+                return f"Qualifire result: {str(result)}"
+
+        return "\n".join(formatted)
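`_format_qualifire_result` renders a dict-shaped response with an optional overall "score", an "evaluationResults" list of typed check groups, and a "status". The payload below is illustrative only, assembled from the field names the formatter itself looks up rather than from any official Qualifire schema, and reuses the grant example appended to data/history.csv:

```python
# Sketch: a response shape _format_qualifire_result can render (assumed, not an official schema).
sample_result = {
    "score": 87,
    "status": "completed",
    "evaluationResults": [
        {
            "type": "grounding",
            "results": [
                {
                    "name": "Grounding check",
                    "label": "UNGROUNDED",
                    "confidence_score": 96,
                    "reason": "The claim says a 10-page proposal; the source requires 5 pages.",
                    "quote": "requires a 5-page detailed project proposal",
                },
            ],
        },
    ],
}

# Passed through the formatter, this would print roughly:
#   Overall score: 87/100
#   --- GROUNDING EVALUATION ---
#   - Grounding check: UNGROUNDED
#    Confidence: 96/100
#    Reason: The claim says a 10-page proposal; the source requires 5 pages.
#    Quote: "requires a 5-page detailed project proposal"
#   Status: completed
```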