"""
Minimal NER Benchmark Runner for HuggingFace Publication

This script evaluates a NER model's performance on key metrics:

- Entity Recognition F1 Score: How well entities are identified and classified
- Precision: Accuracy of positive predictions
- Recall: Ability to find all relevant entities
- Latency: Response time performance
- Entity Type Performance: Results across different entity types
"""

import json
import os
import re
import sys
import time
from datetime import datetime
from typing import Dict, List, Tuple

import requests
import yaml
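
# The benchmark is driven by a YAML config file. A minimal sketch of the keys
# this script reads is shown below; the values are illustrative assumptions,
# not defaults shipped with any particular model or dataset.
#
#   model:
#     base_url: "http://localhost:8080"
#     max_tokens: 256
#     temperature: 0.0
#     timeout: 30
#   datasets:
#     benchmark_dataset:
#       file_path: "data/ner_benchmark.jsonl"
#       sample_size: 100
#       instruction_field: "instruction"
#       input_field: "input"
#       expected_output_field: "response"
#   output:
#     results_file: "benchmark_summary.md"
#     detailed_results_file: "benchmark_results.json"
#     include_examples: true
#     max_examples: 10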


class NERBenchmarkRunner:
    def __init__(self, config_path: str):
        # Load the YAML benchmark configuration
        with open(config_path, 'r') as f:
            self.config = yaml.safe_load(f)

        # Results skeleton: run metadata, aggregate metrics,
        # per-entity-type stats, and a handful of worked examples
        self.results = {
            "metadata": {
                "timestamp": datetime.now().isoformat(),
                "model": "Minibase-NER-Small",
                "dataset": self.config["datasets"]["benchmark_dataset"]["file_path"],
                "sample_size": self.config["datasets"]["benchmark_dataset"]["sample_size"]
            },
            "metrics": {},
            "entity_performance": {},
            "examples": []
        }

    def load_dataset(self) -> List[Dict]:
        """Load and sample the benchmark dataset"""
        dataset_path = self.config["datasets"]["benchmark_dataset"]["file_path"]
        sample_size = self.config["datasets"]["benchmark_dataset"]["sample_size"]

        examples = []
        try:
            with open(dataset_path, 'r') as f:
                # Read at most sample_size JSONL records
                for i, line in enumerate(f):
                    if i >= sample_size:
                        break
                    examples.append(json.loads(line.strip()))
        except FileNotFoundError:
            print(f"⚠️ Dataset file {dataset_path} not found. Creating sample dataset...")
            examples = self.create_sample_dataset(sample_size)

        print(f"✅ Loaded {len(examples)} examples from {dataset_path}")
        return examples
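
    # Each line of the dataset file is one JSON object. A sketch of a record,
    # assuming the default field names used by create_sample_dataset below
    # (the config can remap them via the *_field keys):
    #
    #   {"instruction": "Extract all named entities ...",
    #    "input": "John Smith works at Google in New York ...",
    #    "response": "John B-PER\nSmith I-PER\nworks O\n... Google B-ORG ..."}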

    def create_sample_dataset(self, sample_size: int) -> List[Dict]:
        """Create a sample NER dataset for testing"""
        # Gold annotations are stored in BIO format (one "token TAG" pair per line)
        # so that extract_entities_from_bio_format() can parse them during scoring.
        examples = [
            {
                "instruction": "Extract all named entities from the following text. Return them in JSON format with entity types as keys and lists of entities as values.",
                "input": "John Smith works at Google in New York and uses Python programming language.",
                "response": "John B-PER\nSmith I-PER\nworks O\nat O\nGoogle B-ORG\nin O\nNew B-LOC\nYork I-LOC\nand O\nuses O\nPython B-MISC\nprogramming O\nlanguage O\n. O"
            },
            {
                "instruction": "Extract all named entities from the following text. Return them in JSON format with entity types as keys and lists of entities as values.",
                "input": "Microsoft Corporation announced that Satya Nadella will visit London next week.",
                "response": "Microsoft B-ORG\nCorporation I-ORG\nannounced O\nthat O\nSatya B-PER\nNadella I-PER\nwill O\nvisit O\nLondon B-LOC\nnext O\nweek O\n. O"
            },
            {
                "instruction": "Extract all named entities from the following text. Return them in JSON format with entity types as keys and lists of entities as values.",
                "input": "The University of Cambridge is located in the United Kingdom and was founded by King Henry III.",
                "response": "The O\nUniversity B-ORG\nof I-ORG\nCambridge I-ORG\nis O\nlocated O\nin O\nthe O\nUnited B-LOC\nKingdom I-LOC\nand O\nwas O\nfounded O\nby O\nKing B-PER\nHenry I-PER\nIII I-PER\n. O"
            }
        ]

        # Repeat the seed examples until the requested sample size is reached
        dataset = []
        for i in range(sample_size):
            dataset.append(examples[i % len(examples)].copy())

        # Persist the generated dataset so subsequent runs can reuse it
        with open(self.config["datasets"]["benchmark_dataset"]["file_path"], 'w') as f:
            for example in dataset:
                f.write(json.dumps(example) + '\n')

        return dataset

    def extract_entities_from_prediction(self, prediction: str) -> List[Tuple[str, str, str]]:
        """Extract entities from numbered list prediction format"""
        entities = []
        prediction = prediction.strip()

        for line in prediction.split('\n'):
            line = line.strip()
            if not line:
                continue

            # Numbered list item, e.g. "1. John Smith" or "2. Google - organization"
            numbered_match = re.match(r'^\d+\.\s*(.+?)(?:\s*-\s*.+)?$', line)
            if numbered_match:
                entity_text = numbered_match.group(1).strip()
                # Drop trailing punctuation
                entity_text = re.sub(r'[.,;:!?]$', '', entity_text).strip()

                if entity_text and len(entity_text) > 1 and entity_text.lower() not in ('the', 'and', 'or', 'but', 'for', 'with'):
                    # Numbered lists carry no type or span information, so a generic
                    # type and a placeholder span are recorded
                    entities.append((entity_text, "ENTITY", "0-0"))
            else:
                # A bare number is list noise, not an entity
                if re.match(r'^\d+$', line):
                    continue
                # Fall back to treating any other non-trivial line as an entity
                elif len(line) > 1:
                    entity_text = re.sub(r'[.,;:!?]$', '', line).strip()
                    if entity_text:
                        entities.append((entity_text, "ENTITY", "0-0"))

        return entities
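
    # For example, a model reply like
    #   "1. Neil Armstrong\n2. Buzz Aldrin\n3. NASA"
    # (the same style as the mock output used in run_benchmarks below) parses to
    #   [("Neil Armstrong", "ENTITY", "0-0"),
    #    ("Buzz Aldrin", "ENTITY", "0-0"),
    #    ("NASA", "ENTITY", "0-0")]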

    def extract_entities_from_bio_format(self, bio_text: str) -> List[Tuple[str, str, str]]:
        """Extract entities from BIO format text"""
        entities = []
        lines = bio_text.strip().split('\n')

        current_entity = None
        current_type = None

        for line in lines:
            line = line.strip()
            if not line or line == '.':
                continue

            parts = line.split()
            if len(parts) >= 2:
                token, tag = parts[0], parts[1]

                if tag.startswith('B-'):
                    # A new entity begins; flush any entity in progress
                    if current_entity:
                        entities.append((current_entity, current_type, "0-0"))
                    current_entity = token
                    current_type = tag[2:]
                elif tag.startswith('I-') and current_entity:
                    # Continuation of the current entity
                    current_entity += ' ' + token
                else:
                    # An O tag (or a stray I-) ends the current entity
                    if current_entity:
                        entities.append((current_entity, current_type, "0-0"))
                    current_entity = None
                    current_type = None

        # Flush a trailing entity that runs to the end of the text
        if current_entity:
            entities.append((current_entity, current_type, "0-0"))

        return entities
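
    # For example, the BIO-tagged gold text
    #   "John B-PER\nSmith I-PER\nworks O\nat O\nGoogle B-ORG"
    # yields [("John Smith", "PER", "0-0"), ("Google", "ORG", "0-0")].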

    def normalize_entity_text(self, text: str) -> str:
        """Normalize entity text for better matching"""
        # Lowercase, drop leading articles/titles, and collapse whitespace
        text = text.lower()
        text = re.sub(r'^(the|an?|mr|mrs|ms|dr|prof)\s+', '', text)
        text = ' '.join(text.split())
        return text.strip()
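
    # For example, "The United Kingdom" normalizes to "united kingdom",
    # and "Dr Smith" normalizes to "smith".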

    def calculate_ner_metrics(self, predicted_entities: List[Tuple], expected_bio_text: str) -> Dict[str, float]:
        """Calculate NER metrics: precision, recall, F1"""
        expected_entities = self.extract_entities_from_bio_format(expected_bio_text)

        # Compare normalized surface forms (entity-level, type-agnostic)
        pred_texts = set(self.normalize_entity_text(ent[0]) for ent in predicted_entities)
        exp_texts = set(self.normalize_entity_text(ent[0]) for ent in expected_entities)

        # Exact matches
        exact_matches = pred_texts & exp_texts
        true_positives = len(exact_matches)

        # Partial (substring) matches; each remaining expected entity can be
        # credited at most once so that recall can never exceed 1.0
        additional_matches = 0
        matched_expected = set()
        for pred in pred_texts - exact_matches:
            for exp in exp_texts - exact_matches:
                if exp in matched_expected:
                    continue
                if (pred in exp or exp in pred) and len(pred) > 3 and len(exp) > 3:
                    additional_matches += 1
                    matched_expected.add(exp)
                    break

        true_positives += additional_matches
        false_positives = len(pred_texts) - true_positives
        false_negatives = len(exp_texts) - true_positives

        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        return {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "true_positives": true_positives,
            "false_positives": false_positives,
            "false_negatives": false_negatives
        }
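
    # Worked example: 2 predicted entities, 3 gold entities, 2 exact matches:
    #   precision = 2 / 2 = 1.000
    #   recall    = 2 / 3 ≈ 0.667
    #   F1        = 2 * 1.000 * 0.667 / (1.000 + 0.667) ≈ 0.800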

    def call_model(self, instruction: str, input_text: str) -> Tuple[str, float]:
        """Call the NER model and measure latency"""
        prompt = f"{instruction}\n\nInput: {input_text}\n\nResponse: "

        payload = {
            "prompt": prompt,
            "max_tokens": self.config["model"]["max_tokens"],
            "temperature": self.config["model"]["temperature"]
        }

        headers = {'Content-Type': 'application/json'}

        start_time = time.time()
        try:
            response = requests.post(
                f"{self.config['model']['base_url']}/completion",
                json=payload,
                headers=headers,
                timeout=self.config["model"]["timeout"]
            )
            latency = (time.time() - start_time) * 1000

            if response.status_code == 200:
                result = response.json()
                return result.get('content', ''), latency
            else:
                return f"Error: Server returned status {response.status_code}", latency
        except requests.exceptions.RequestException as e:
            latency = (time.time() - start_time) * 1000
            return f"Error: {e}", latency
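
    # The server is assumed to expose a llama.cpp-style HTTP API: POST /completion
    # with a JSON body containing "prompt", returning a JSON reply whose "content"
    # field holds the generated text, e.g. {"content": "1. John Smith\n2. Google"}.
    # If your backend differs, adjust the endpoint and response parsing above.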

    def run_benchmarks(self):
        """Run the complete benchmark suite"""
        print("🚀 Starting NER Benchmarks...")
        print(f"📊 Sample size: {self.config['datasets']['benchmark_dataset']['sample_size']}")
        print(f"🎯 Model: {self.results['metadata']['model']}")
        print()

        # Sanity-check the numbered-list parser against a known mock output
        print("🔧 Testing numbered list parsing with mock data...")
        mock_output = "1. Neil Armstrong\n2. Buzz Aldrin\n3. NASA\n4. Moon\n5. Apollo 11"

        print("Testing NER numbered list format:")
        mock_entities = self.extract_entities_from_prediction(mock_output)
        print(f"✅ Numbered list parsing: {len(mock_entities)} entities extracted")

        if mock_entities:
            print("Sample entities:")
            for entity in mock_entities:
                print(f"  - {entity[0]} ({entity[1]})")
        print()

        examples = self.load_dataset()

        # Running totals for macro-averaged metrics
        total_precision = 0
        total_recall = 0
        total_f1 = 0
        total_latency = 0
        entity_type_metrics = {}
        successful_requests = 0

        for i, example in enumerate(examples):
            if i % 10 == 0:
                print(f"📈 Progress: {i}/{len(examples)} examples processed")

            instruction = example[self.config["datasets"]["benchmark_dataset"]["instruction_field"]]
            input_text = example[self.config["datasets"]["benchmark_dataset"]["input_field"]]
            expected_output = example[self.config["datasets"]["benchmark_dataset"]["expected_output_field"]]

            # Query the model and time the request
            predicted_output, latency = self.call_model(instruction, input_text)

            if not predicted_output.startswith("Error"):
                successful_requests += 1

                try:
                    predicted_entities = self.extract_entities_from_prediction(predicted_output)

                    # Entity-level precision/recall/F1 against the BIO gold annotation
                    metrics = self.calculate_ner_metrics(predicted_entities, expected_output)

                    total_precision += metrics["precision"]
                    total_recall += metrics["recall"]
                    total_f1 += metrics["f1"]
                    total_latency += latency

                    # Per-type accuracy of predicted entities (gold entities are
                    # parsed once per example, not once per predicted entity)
                    expected_entities_list = self.extract_entities_from_bio_format(expected_output)
                    expected_entity_texts = [self.normalize_entity_text(e[0]) for e in expected_entities_list]

                    for entity_text, entity_type, _ in predicted_entities:
                        if entity_type not in entity_type_metrics:
                            entity_type_metrics[entity_type] = {"correct": 0, "total": 0}

                        normalized_entity = self.normalize_entity_text(entity_text)

                        # Exact match first, then a lenient substring match
                        is_correct = normalized_entity in expected_entity_texts
                        if not is_correct:
                            for exp_text in expected_entity_texts:
                                if normalized_entity in exp_text or exp_text in normalized_entity:
                                    if len(normalized_entity) > 3 and len(exp_text) > 3:
                                        is_correct = True
                                        break

                        if is_correct:
                            entity_type_metrics[entity_type]["correct"] += 1
                        entity_type_metrics[entity_type]["total"] += 1

                    # Keep a few worked examples for the report
                    if len(self.results["examples"]) < self.config["output"]["max_examples"]:
                        self.results["examples"].append({
                            "input": input_text,
                            "expected": expected_output,
                            "predicted": predicted_output,
                            "metrics": metrics,
                            "latency_ms": latency
                        })

                except Exception as e:
                    print(f"⚠️ Error processing example {i}: {e}")
                    continue

        # Macro-average the per-example metrics over successful requests
        if successful_requests > 0:
            self.results["metrics"] = {
                "precision": total_precision / successful_requests,
                "recall": total_recall / successful_requests,
                "f1_score": total_f1 / successful_requests,
                "average_latency_ms": total_latency / successful_requests,
                "successful_requests": successful_requests,
                "total_requests": len(examples)
            }

        # Per-entity-type accuracy
        self.results["entity_performance"] = {}
        for entity_type, counts in entity_type_metrics.items():
            accuracy = counts["correct"] / counts["total"] if counts["total"] > 0 else 0.0
            self.results["entity_performance"][entity_type] = {
                "accuracy": accuracy,
                "correct_predictions": counts["correct"],
                "total_predictions": counts["total"]
            }

        self.save_results()

    def save_results(self):
        """Save benchmark results to files"""
        # Full machine-readable results
        with open(self.config["output"]["detailed_results_file"], 'w') as f:
            json.dump(self.results, f, indent=2)

        # Human-readable Markdown summary
        summary = self.generate_summary()
        with open(self.config["output"]["results_file"], 'w') as f:
            f.write(summary)

        print("\n✅ Benchmark complete!")
        print(f"📄 Detailed results saved to: {self.config['output']['detailed_results_file']}")
        print(f"📊 Summary saved to: {self.config['output']['results_file']}")

    def generate_summary(self) -> str:
        """Generate a human-readable benchmark summary"""
        m = self.results["metrics"]
        ep = self.results["entity_performance"]

        summary = f"""# NER Benchmark Results

**Model:** {self.results['metadata']['model']}
**Dataset:** {self.results['metadata']['dataset']}
**Sample Size:** {self.results['metadata']['sample_size']}
**Date:** {self.results['metadata']['timestamp']}

## Overall Performance

| Metric | Score | Description |
|--------|-------|-------------|
| F1 Score | {m.get('f1_score', 0):.3f} | Overall NER performance (harmonic mean of precision and recall) |
| Precision | {m.get('precision', 0):.3f} | Accuracy of entity predictions |
| Recall | {m.get('recall', 0):.3f} | Ability to find all entities |
| Average Latency | {m.get('average_latency_ms', 0):.1f}ms | Response time performance |

## Entity Type Performance

"""
        if ep:
            summary += "| Entity Type | Accuracy | Correct/Total |\n"
            summary += "|-------------|----------|---------------|\n"
            for entity_type, stats in ep.items():
                summary += f"| {entity_type} | {stats['accuracy']:.3f} | {stats['correct_predictions']}/{stats['total_predictions']} |\n"
        else:
            summary += "No entity type performance data available.\n"

        summary += """
## Key Improvements

- **BIO Tagging**: Gold annotations use BIO (Beginning-Inside-Outside) format
- **Multiple Entity Types**: Supports PER, ORG, LOC, and MISC entities
- **Entity-Level Evaluation**: Metrics are calculated at the entity level rather than the token level
- **Comprehensive Coverage**: Evaluates across different text domains

"""

        if self.config["output"]["include_examples"] and self.results["examples"]:
            summary += "## Example Results\n\n"
            for i, example in enumerate(self.results["examples"][:3]):
                summary += f"### Example {i+1}\n"
                summary += f"**Input:** {example['input'][:100]}...\n"
                summary += f"**Predicted:** {example['predicted'][:200]}...\n"
                summary += f"**F1 Score:** {example['metrics']['f1']:.3f}\n\n"

        return summary


def main():
    if len(sys.argv) != 2:
        print("Usage: python run_benchmarks.py <config_file>")
        sys.exit(1)

    config_path = sys.argv[1]
    if not os.path.exists(config_path):
        print(f"Error: Config file {config_path} not found")
        sys.exit(1)

    runner = NERBenchmarkRunner(config_path)
    runner.run_benchmarks()


if __name__ == "__main__":
    main()