added qualifire to the mix
Files changed:

- README.md (+1 -0)
- data/history.csv (+15 -1)
- data/leaderboard.csv (+5 -5)
- requirements.txt (+1 -0)
- src/app.py (+22 -0)
- src/judge.py (+182 -1)
README.md
@@ -50,6 +50,7 @@ EvalArena/
 ```
 OPENAI_API_KEY=your_key_here
 ANTHROPIC_API_KEY=your_key_here
+QUALIFIRE_API_KEY=your_qualifire_key_here
 ```

 ## Running
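The new key is read from the environment in src/judge.py via `os.environ.get("QUALIFIRE_API_KEY", "")`. Below is a minimal sketch of loading these keys at startup, assuming they live in a local `.env` file and are loaded with `python-dotenv` (already listed in requirements.txt); the actual startup wiring is not part of this diff:

```python
# Sketch: load API keys from a local .env file (assumed location) before starting the app.
import os

from dotenv import load_dotenv

load_dotenv()  # reads OPENAI_API_KEY, ANTHROPIC_API_KEY, QUALIFIRE_API_KEY into os.environ

if not os.environ.get("QUALIFIRE_API_KEY"):
    # src/judge.py skips the Qualifire call when this key is missing
    print("QUALIFIRE_API_KEY not set; Qualifire evaluation will be skipped")
```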
data/history.csv
@@ -57,4 +57,18 @@ LABEL: GROUNDED
 CONFIDENCE: 100",deepseek-r1,DeepSeek R1,"Evaluation time: 7.51 seconds

 LABEL: GROUNDED
-CONFIDENCE: 95",none,,0.
+CONFIDENCE: 95",none,,0.4411702156066894,7.508124828338623
+2025-04-26T18:55:42.278812,All applications for the Community Development Grant must be submitted online through the city portal. The submission window opens on October 1st and closes promptly on November 15th at 5:00 PM Eastern Time. The application requires a 5-page detailed project proposal and a separate budget spreadsheet.,The application requires a 10-page detailed project proposal.,claude-3-opus-latest,Claude 3 Opus,"Evaluation time: 1.20 seconds
+
+LABEL: UNGROUNDED
+CONFIDENCE: 100",meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,"Evaluation time: 1.15 seconds
+
+LABEL: UNGROUNDED
+CONFIDENCE: 100",claude-3-opus-latest,,1.1986050605773926,1.1471669673919678
+2025-04-26T18:56:39.750327,A pilot telehealth program was launched to improve access to primary care services for patients in rural areas. Initial results show that approximately 60% of routine follow-up appointments can be effectively managed via telehealth. Certain complex cases and initial diagnoses still require in-person consultations.,"Implementing a new telehealth program will completely eliminate the need for in-person doctor visits for all patients, regardless of their medical condition or age.",claude-3-opus-latest,Claude 3 Opus,"Evaluation time: 1.37 seconds
+
+LABEL: UNGROUNDED
+CONFIDENCE: 95",claude-3-5-sonnet-latest,Claude 3.5 Sonnet,"Evaluation time: 36.14 seconds
+
+LABEL: UNGROUNDED
+CONFIDENCE: 95",claude-3-opus-latest,,1.3739988803863525,36.14444375038147
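The appended rows quote the judge evaluations, so a single logical record spans several physical lines (the embedded newlines are why `LABEL:` and `CONFIDENCE:` appear on their own lines above). Standard CSV parsers handle this; a small sketch with pandas (already a dependency), with no column names assumed since the header row is not visible in this hunk:

```python
# Sketch: quoted multi-line fields in history.csv parse as single rows.
import pandas as pd

history = pd.read_csv("data/history.csv")
print(len(history))       # grows by the two battles appended in this commit
print(history.iloc[-1])   # the telehealth example, evaluations as multi-line strings
```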
data/leaderboard.csv
@@ -1,6 +1,6 @@
 judge_id,judge_name,elo_score,wins,losses,total_evaluations,organization,license
 gemma-2-27b-it,Gemma 2 27B,1723.9484210232677,25.0,1.0,26.0,Google,Open Source
-claude-3-opus-latest,Claude 3 Opus,
+claude-3-opus-latest,Claude 3 Opus,1559.6185917879666,4.0,0.0,4.0,Anthropic,Proprietary
 claude-3-5-haiku-latest,Claude 3.5 Haiku,1521.2089100627643,1.0,1.0,2.0,Anthropic,Proprietary
 mistral-7b-instruct-v0.1,Mistral (7B) Instruct v0.1,1516.736306793522,1.0,0.0,1.0,Mistral AI,Open Source
 qwen-2.5-7b-instruct-turbo,Qwen 2.5 7B Instruct,1516.0,1.0,0.0,1.0,Alibaba,Open Source
@@ -8,24 +8,24 @@ claude-3-sonnet-20240229,Claude 3 Sonnet,1515.263693206478,1.0,0.0,1.0,Anthropic
 meta-llama-3.3-70B-instruct-turbo,Meta Llama 4 Scout 32K Instruct,1511.8243832068688,1.0,1.0,2.0,Meta,Open Source
 gpt-4.1,GPT-4.1,1502.1692789932397,1.0,1.0,2.0,OpenAI,Proprietary
 claude-3-haiku-20240307,Claude 3 Haiku,1501.6053648908744,3.0,3.0,6.0,Anthropic,Proprietary
+qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source
 judge4,PrecisionJudge,1500.0,0.0,0.0,0.0,Anthropic,Commercial
 judge3,GradeAssist,1500.0,0.0,0.0,0.0,Anthropic,Commercial
 judge2,CritiqueBot,1500.0,0.0,0.0,0.0,OpenAI,Commercial
 gemma-2-9b-it,Gemma 2 9B,1500.0,0.0,0.0,0.0,Google,Open Source
-qwen-2-72b-instruct,Qwen 2 Instruct (72B),1500.0,0.0,0.0,0.0,Alibaba,Open Source
-atla-selene,Atla Selene,1500.0,0.0,0.0,0.0,Atla,Proprietary
 judge5,Mixtral,1500.0,0.0,0.0,0.0,Mistral AI,Commercial
 meta-llama-4-scout-17B-16E-instruct,Meta Llama 4 Scout 17B 16E Instruct,1500.0,0.0,0.0,0.0,Meta,Open Source
 o3-mini, o3-mini,1500.0,0.0,0.0,0.0,OpenAI,Proprietary
 judge1,EvalGPT,1500.0,0.0,0.0,0.0,OpenAI,Commercial
-
+atla-selene,Atla Selene,1500.0,0.0,0.0,0.0,Atla,Proprietary
 meta-llama-3.1-70b-instruct-turbo,Meta Llama 3.1 70B Instruct,1499.2598341210926,2.0,2.0,4.0,Meta,Open Source
 gpt-4-turbo,GPT-4 Turbo,1497.676800228027,1.0,2.0,3.0,OpenAI,Proprietary
 deepseek-v3,DeepSeek V3,1496.4838513726352,1.0,2.0,3.0,DeepSeek,Open Source
 deepseek-r1,DeepSeek R1,1495.8192027996802,0.0,1.0,1.0,DeepSeek,Open Source
-
+meta-llama-3.1-405b-instruct-turbo,Meta Llama 3.1 405B Instruct,1484.765265966291,1.0,2.0,3.0,Meta,Open Source
 mistral-7b-instruct-v0.3,Mistral (7B) Instruct v0.3,1481.2300851469852,0.0,4.0,4.0,Mistral AI,Open Source
 meta-llama-3.1-8b-instruct-turbo,Meta Llama 3.1 8B Instruct,1481.194128995395,1.0,2.0,3.0,Meta,Open Source
+claude-3-5-sonnet-latest,Claude 3.5 Sonnet,1470.8460024310996,0.0,2.0,2.0,Anthropic,Proprietary
 gpt-4o,GPT-4o,1466.0577517475272,0.0,3.0,3.0,OpenAI,Proprietary
 qwen-2.5-72b-instruct-turbo,Qwen 2.5 72B Instruct,1412.6552679185854,21.0,25.0,46.0,Alibaba,Open Source
 gpt-3.5-turbo,GPT-3.5 Turbo,1318.2061729482512,0.0,21.0,21.0,OpenAI,Proprietary
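The leaderboard columns are judge_id, judge_name, elo_score, wins, losses, total_evaluations, organization, and license; unrated judges sit at 1500.0 and scores move after each head-to-head battle. The repo's actual rating code is not part of this diff, so the following is only a generic Elo update sketch with an assumed K-factor of 32 (which happens to be consistent with qwen-2.5-7b-instruct-turbo landing on 1516.0 after a single win against a 1500-rated opponent):

```python
# Sketch: generic Elo update matching the leaderboard columns; K=32 is an assumption.
def elo_update(winner: float, loser: float, k: float = 32.0) -> tuple[float, float]:
    expected = 1.0 / (1.0 + 10 ** ((loser - winner) / 400.0))  # winner's expected score
    return winner + k * (1.0 - expected), loser - k * (1.0 - expected)

print(elo_update(1500.0, 1500.0))  # (1516.0, 1484.0)
```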
requirements.txt
@@ -5,4 +5,5 @@ loguru>=0.7.0
 numpy>=1.24.0
 pandas>=2.0.0
 python-dotenv>=1.0.0
+qualifire>=0.6.7
 together>=0.1.5
src/app.py
@@ -184,6 +184,28 @@ def submit_example(
         gr.update(visible=False),
     )

+    # Format inputs for Qualifire evaluation
+    input_text, output_text = format_inputs_for_evaluation(
+        text_input,
+        claim_input,
+        single_text_input,
+        policy_input,
+        policy_output,
+        policy_assertion,
+        test_type,
+    )
+
+    # Get a single Qualifire evaluation to be shared by both judges
+    qualifire_result = judge_manager.evaluate_with_qualifire(
+        input_text,
+        output_text,
+        test_type,
+    )
+    logger.info("Completed Qualifire evaluation")
+
+    # Store the Qualifire result for both judges to use
+    judge_manager.shared_qualifire_result = qualifire_result
+
     # Show loading messages while evaluations are in progress
     status_text = "Evaluations starting... Both judges will evaluate in parallel."
     return (
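This change calls Qualifire once per submitted example and stashes the result on the judge manager, so both anonymous judges reuse the same verdict instead of each triggering a separate API call. Below is a standalone sketch of that sharing pattern with the Gradio wiring and the real API call stubbed out; the class and attribute names mirror the diff, while `judge_evaluation` is only a stand-in for the judges' evaluation path:

```python
# Sketch: one Qualifire call per example, shared by both judges via the manager.
class JudgeManager:
    def __init__(self) -> None:
        self.shared_qualifire_result = ""

    def evaluate_with_qualifire(self, input_text: str, output_text: str, test_type: str) -> str:
        # Stub standing in for the real Qualifire API call in src/judge.py.
        return f"[qualifire verdict for test_type={test_type!r}]"

    def judge_evaluation(self, judge_id: str) -> str:
        evaluation = f"{judge_id}: LABEL/CONFIDENCE from the LLM judge"
        if self.shared_qualifire_result:
            # Same appended block as judge.py's "Add Qualifire result if available".
            evaluation += f"\n\nQualifire evaluation:\n{self.shared_qualifire_result}"
        return evaluation


manager = JudgeManager()
manager.shared_qualifire_result = manager.evaluate_with_qualifire("context", "claim", "grounding")
for judge in ("judge_a", "judge_b"):
    print(manager.judge_evaluation(judge))
```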
src/judge.py
@@ -1,3 +1,4 @@
+import os
 import random
 import time
 from typing import Any, Dict, List, Tuple
@@ -5,6 +6,7 @@ from typing import Any, Dict, List, Tuple
 # Add litellm configuration to handle unsupported parameters
 import litellm
 import pandas as pd
+import qualifire
 from litellm import completion
 from loguru import logger
 from together import Together
@@ -21,6 +23,12 @@ class JudgeManager:
         self.judges = judges
         self.leaderboard_df = self._init_leaderboard()
         self.together_client = Together()
+        # Initialize Qualifire client with API key from environment variables
+        self.qualifire_client = qualifire.client.Client(
+            api_key=os.environ.get("QUALIFIRE_API_KEY", ""),
+        )
+        # Store shared Qualifire evaluation results
+        self.shared_qualifire_result = ""

     def _init_leaderboard(self) -> pd.DataFrame:
         """Initialize or load the leaderboard dataframe"""
@@ -87,11 +95,18 @@
             # Start timing
             start_time = time.time()

+            # Use shared Qualifire result instead of calling for each judge
+            qualifire_result = self.shared_qualifire_result
+
             # Create appropriate system prompt based on test type
             system_prompt = self._get_system_prompt(test_type)

             # Format user message with input and output
-            user_message = self._create_user_message(
+            user_message = self._create_user_message(
+                input_text,
+                output_text,
+                test_type,
+            )

             # Set temperature based on model
             temperature = 0.2
@@ -130,6 +145,10 @@
             # Format the final evaluation with timing info
             evaluation = f"Evaluation time: {elapsed_time:.2f} seconds\n\n{parsed_evaluation}"

+            # Add Qualifire result if available
+            if qualifire_result:
+                evaluation += f"\n\nQualifire evaluation:\n{qualifire_result}"
+
             # Format the evaluation - store the judge info but don't display it yet
             anonymous_eval = evaluation

@@ -145,6 +164,7 @@
                 "elapsed_time": elapsed_time,
                 "input_text": input_text,
                 "output_text": output_text,
+                "qualifire_result": qualifire_result,
             }

         except Exception as e:
@@ -172,6 +192,7 @@
                 "elapsed_time": elapsed_time,
                 "input_text": input_text,
                 "output_text": output_text,
+                "qualifire_result": None,
             }

     def _create_user_message(self, input_text: str, output_text: str, test_type: str) -> str:
@@ -470,3 +491,163 @@ or

 LABEL: POOR_RESPONSE
 CONFIDENCE: 72"""
+
+    def evaluate_with_qualifire(
+        self,
+        input_text: str,
+        output_text: str,
+        test_type: str,
+    ) -> str:
+        """Call Qualifire API with appropriate parameters based on test type.
+        This is a standalone method to be called once per evaluation."""
+        try:
+            # Skip Qualifire if API key is not set
+            if not os.environ.get("QUALIFIRE_API_KEY"):
+                logger.warning(
+                    "QUALIFIRE_API_KEY not set, skipping Qualifire evaluation",
+                )
+                return ""
+
+            # Map test types to Qualifire parameters
+            prompt_injections = test_type == "prompt injections"
+            grounding_check = test_type == "grounding"
+            safety_check = test_type == "safety"
+
+            # Extract assertions if available (from policy test type)
+            assertions = []
+            if test_type == "policy":
+                # First try structured format
+                for line in input_text.split("\n"):
+                    if line.startswith("Assertion:"):
+                        assertion = line[len("Assertion:") :].strip()
+                        if assertion:
+                            assertions = [assertion]
+                            break
+
+                # If no assertion found, check for other formats
+                if not assertions and "Assertion:" in input_text:
+                    assertion_parts = input_text.split("Assertion:")
+                    if len(assertion_parts) > 1:
+                        assertions = [assertion_parts[1].strip()]
+
+                # Log what we found
+                if assertions:
+                    logger.info(f"Found policy assertion: {assertions[0]}")
+                else:
+                    logger.warning("No policy assertion found in input")
+
+            # Call Qualifire API
+            logger.info(f"Calling Qualifire with test_type={test_type}, assertions={assertions}")
+
+            # Debug logs to help diagnose issues
+            logger.debug(f"Qualifire input: {input_text[:100]}...")
+            logger.debug(f"Qualifire output: {output_text[:100]}...")
+
+            result = self.qualifire_client.evaluate(
+                input=input_text,
+                output=output_text,
+                prompt_injections=prompt_injections,
+                grounding_check=grounding_check,
+                assertions=assertions,
+                dangerous_content_check=safety_check,
+                sexual_content_check=safety_check,
+                harassment_check=safety_check,
+                hate_speech_check=safety_check,
+            )
+
+            # Log response structure to help with debugging
+            if isinstance(result, dict):
+                logger.info(f"Qualifire response keys: {result.keys()}")
+            else:
+                logger.info(f"Qualifire response type: {type(result)}")
+
+            # Format the result for display
+            formatted_result = self._format_qualifire_result(result)
+            return formatted_result
+
+        except Exception as e:
+            logger.error(f"Error in Qualifire evaluation: {str(e)}")
+            import traceback
+
+            logger.error(f"Traceback: {traceback.format_exc()}")
+            return f"Qualifire evaluation error: {str(e)}"
+
+    def _format_qualifire_result(self, result) -> str:
+        """Format Qualifire result for display based on EvaluationResponse structure"""
+        if not result:
+            return ""
+
+        formatted = []
+
+        logger.info(f"Qualifire result type: {type(result)}")
+
+        try:
+            # Add overall score if available
+            if isinstance(result, dict) and "score" in result:
+                formatted.append(f"Overall score: {result['score']}/100")
+
+            # Process each evaluation result item
+            if isinstance(result, dict) and "evaluationResults" in result:
+                eval_results = result["evaluationResults"]
+
+                if not eval_results:
+                    formatted.append("No specific evaluation results provided")
+                elif isinstance(eval_results, list):
+                    for eval_item in eval_results:
+                        if isinstance(eval_item, dict):
+                            # Add the evaluation type if available
+                            if "type" in eval_item:
+                                formatted.append(f"\n--- {eval_item['type'].upper()} EVALUATION ---")
+
+                            # Process results if available
+                            if "results" in eval_item and isinstance(eval_item["results"], list):
+                                for eval_result in eval_item["results"]:
+                                    if not isinstance(eval_result, dict):
+                                        continue
+
+                                    # Format the label
+                                    label = eval_result.get("label", "UNKNOWN")
+                                    name = eval_result.get("name", "Check")
+                                    formatted.append(f"- {name}: {label}")
+
+                                    # Add confidence score if available
+                                    if "confidence_score" in eval_result:
+                                        formatted.append(f" Confidence: {eval_result['confidence_score']}/100")
+
+                                    # Add reason if available
+                                    if "reason" in eval_result and eval_result["reason"]:
+                                        reason = str(eval_result["reason"]).replace("\n", " ")
+                                        if len(reason) > 100:
+                                            reason = reason[:97] + "..."
+                                        formatted.append(f" Reason: {reason}")
+
+                                    # Add quote if available
+                                    if "quote" in eval_result and eval_result["quote"]:
+                                        quote = str(eval_result["quote"])
+                                        if len(quote) > 50:
+                                            quote = quote[:47] + "..."
+                                        formatted.append(f' Quote: "{quote}"')
+                        else:
+                            # Handle unexpected item type
+                            formatted.append(f"Unexpected evaluation item format: {type(eval_item)}")
+                else:
+                    # Handle unexpected evaluationResults format
+                    formatted.append(f"Unexpected evaluationResults format: {type(eval_results)}")
+
+            # Add status if available
+            if isinstance(result, dict) and "status" in result:
+                formatted.append(f"\nStatus: {result['status']}")
+
+        except Exception as e:
+            # Catch any formatting errors and return a simplified result
+            logger.error(f"Error formatting Qualifire result: {str(e)}")
+            import json
+
+            try:
+                # Try to return raw result as JSON string
+                return f"Qualifire raw result: {json.dumps(result, indent=2)}"
+            except Exception:
+                # If JSON serialization fails, return string representation
+                return f"Qualifire result: {str(result)}"
+
+        return "\n".join(formatted)
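`_format_qualifire_result` renders a dict-shaped response with an optional overall "score", an "evaluationResults" list of typed check groups, and a "status". The payload below is illustrative only, assembled from the field names the formatter itself looks up rather than from any official Qualifire schema, and reuses the grant example appended to data/history.csv:

```python
# Sketch: a response shape _format_qualifire_result can render (assumed, not an official schema).
sample_result = {
    "score": 87,
    "status": "completed",
    "evaluationResults": [
        {
            "type": "grounding",
            "results": [
                {
                    "name": "Grounding check",
                    "label": "UNGROUNDED",
                    "confidence_score": 96,
                    "reason": "The claim says a 10-page proposal; the source requires 5 pages.",
                    "quote": "requires a 5-page detailed project proposal",
                },
            ],
        },
    ],
}

# Passed through the formatter, this would print roughly:
#   Overall score: 87/100
#   --- GROUNDING EVALUATION ---
#   - Grounding check: UNGROUNDED
#    Confidence: 96/100
#    Reason: The claim says a 10-page proposal; the source requires 5 pages.
#    Quote: "requires a 5-page detailed project proposal"
#   Status: completed
```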