#!/usr/bin/env python3
"""
Score Utilizer - Extract and utilize the highest-scoring pages from retrieval logs.

This module provides utilities to parse log output and retrieve the best pages
based on their scores.
"""

import re
from datetime import datetime
from typing import Dict, List


class ScoreUtilizer:
    """Utility class to extract and utilize the highest-scoring pages from retrieval logs."""

    def __init__(self):
        self.score_patterns = {
            'page_score': r'Page\s+(\d+)\s+\(doc_id:\s*(\d+)\)\s*\|\s*Score:\s*([\d.]+)',
            'highest_scoring': r'(\d+)\.\s*Page\s+(\d+)\s+-\s*Score:\s*([\d.]+)',
            'relevance_level': r'([🟢🟡🟠🔵🟣🔴])\s+([A-Z\s]+)\s+-\s+(.+)',
        }
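        # Example lines each pattern is intended to match (taken from the sample
        # log in demonstrate_score_utilization() below):
        #   page_score:      "Page 1 (doc_id: 0) | Score: 0.9234"
        #   highest_scoring: "1. Page 1 - Score: 0.9234"
        #   relevance_level: "🟢 EXCELLENT - Highly relevant"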

    def parse_log_output(self, log_text: str) -> Dict:
        """
        Parse log output to extract page scores and relevance information.

        Args:
            log_text: Raw log output from the retrieval system.

        Returns:
            Dictionary containing parsed page scores and metadata.
        """
        print("🔍 PARSING LOG OUTPUT FOR HIGHEST-SCORING PAGES")
        print("=" * 60)

        # Extract page scores
        page_scores = self._extract_page_scores(log_text)
        # Extract the highest-scoring pages
        top_pages = self._extract_top_pages(log_text)
        # Extract the relevance distribution
        relevance_dist = self._extract_relevance_distribution(log_text)
        # Extract statistics
        stats = self._extract_statistics(log_text)

        result = {
            'page_scores': page_scores,
            'top_pages': top_pages,
            'relevance_distribution': relevance_dist,
            'statistics': stats,
            'parsed_at': self._get_timestamp(),
        }

        print(f"✅ Successfully parsed {len(page_scores)} page scores")
        print(f"🏆 Found {len(top_pages)} top-scoring pages")
        print("=" * 60)
        return result

    def _extract_page_scores(self, log_text: str) -> List[Dict]:
        """Extract individual page scores from log text."""
        page_scores = []
        # Pattern: "Page 1 (doc_id: 0) | Score: 0.9234 | 🟢 EXCELLENT - Highly relevant"
        pattern = self.score_patterns['page_score']
        matches = re.findall(pattern, log_text)
        for page_num, doc_id, score in matches:
            page_scores.append({
                'page_number': int(page_num),
                'doc_id': int(doc_id),
                'score': float(score),
                'relevance_level': self._get_relevance_level(float(score)),
            })
        # Sort by score (highest first)
        page_scores.sort(key=lambda x: x['score'], reverse=True)
        return page_scores

    def _extract_top_pages(self, log_text: str) -> List[Dict]:
        """Extract top-scoring pages from log text."""
        top_pages = []
        # Pattern: "1. Page 1 - Score: 0.9234"
        pattern = self.score_patterns['highest_scoring']
        matches = re.findall(pattern, log_text)
        for rank, page_num, score in matches:
            top_pages.append({
                'rank': int(rank),
                'page_number': int(page_num),
                'score': float(score),
                'relevance_level': self._get_relevance_level(float(score)),
            })
        return top_pages

    def _extract_relevance_distribution(self, log_text: str) -> Dict:
        """Extract the relevance distribution from log text."""
        distribution = {
            'excellent': 0,
            'very_good': 0,
            'good': 0,
            'moderate': 0,
            'basic': 0,
            'poor': 0,
        }
        # Look for distribution lines like "🟢 Excellent (≥0.90): 2 pages"
        patterns = {
            'excellent': r'🟢\s+Excellent.*?(\d+)\s+pages?',
            'very_good': r'🟡\s+Very Good.*?(\d+)\s+pages?',
            'good': r'🟠\s+Good.*?(\d+)\s+pages?',
            'moderate': r'🔵\s+Moderate.*?(\d+)\s+pages?',
            'basic': r'🟣\s+Basic.*?(\d+)\s+pages?',
            'poor': r'🔴\s+Poor.*?(\d+)\s+pages?',
        }
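        # Note: '.' does not match newlines, so each label and its page count
        # must appear on the same log line for these patterns to fire.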
        for level, pattern in patterns.items():
            match = re.search(pattern, log_text)
            if match:
                distribution[level] = int(match.group(1))
        return distribution

    def _extract_statistics(self, log_text: str) -> Dict:
        """Extract statistical information from log text."""
        stats = {}
        # Extract the average score
        avg_match = re.search(r'Average.*?Score:\s*([\d.]+)', log_text)
        if avg_match:
            stats['average_score'] = float(avg_match.group(1))
        # Extract the highest score
        high_match = re.search(r'Highest.*?Score:\s*([\d.]+)', log_text)
        if high_match:
            stats['highest_score'] = float(high_match.group(1))
        # Extract the lowest score
        low_match = re.search(r'Lowest.*?Score:\s*([\d.]+)', log_text)
        if low_match:
            stats['lowest_score'] = float(low_match.group(1))
        # Extract total pages
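        # (This expects a summary line like "Total: 15 results" on one line;
        # a line such as "Total documents found: 15" has no "results" token,
        # so the field may be absent for some log formats.)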
        total_match = re.search(r'Total.*?(\d+).*?results?', log_text)
        if total_match:
            stats['total_pages'] = int(total_match.group(1))
        return stats

    def get_highest_scoring_pages(self, parsed_data: Dict, count: int = 3) -> List[Dict]:
        """
        Get the highest-scoring pages from parsed data.

        Args:
            parsed_data: Parsed log data from parse_log_output().
            count: Number of top pages to return.

        Returns:
            List of the highest-scoring pages.
        """
        if 'page_scores' not in parsed_data:
            return []
        return parsed_data['page_scores'][:count]

    def get_pages_by_threshold(self, parsed_data: Dict, threshold: float = 0.80) -> List[Dict]:
        """
        Get pages that meet or exceed a score threshold.

        Args:
            parsed_data: Parsed log data from parse_log_output().
            threshold: Minimum score threshold.

        Returns:
            List of pages meeting the threshold.
        """
        if 'page_scores' not in parsed_data:
            return []
        return [page for page in parsed_data['page_scores'] if page['score'] >= threshold]

    def get_pages_by_relevance_level(self, parsed_data: Dict, level: str = 'excellent') -> List[Dict]:
        """
        Get pages with a specific relevance level.

        Args:
            parsed_data: Parsed log data from parse_log_output().
            level: Relevance level ('excellent', 'very_good', 'good', 'moderate', 'basic', 'poor').

        Returns:
            List of pages with the specified relevance level.
        """
        if 'page_scores' not in parsed_data:
            return []
        level_mapping = {
            'excellent': '🟢 EXCELLENT',
            'very_good': '🟡 VERY GOOD',
            'good': '🟠 GOOD',
            'moderate': '🔵 MODERATE',
            'basic': '🟣 BASIC',
            'poor': '🔴 POOR',
        }
        target_level = level_mapping.get(level, '🟢 EXCELLENT')
        return [page for page in parsed_data['page_scores'] if target_level in page['relevance_level']]

    def generate_utilization_report(self, parsed_data: Dict) -> str:
        """
        Generate a comprehensive report on how to utilize the highest-scoring pages.

        Args:
            parsed_data: Parsed log data from parse_log_output().

        Returns:
            Formatted report string.
        """
        report = []
        report.append("📊 HIGHEST-SCORING PAGES UTILIZATION REPORT")
        report.append("=" * 60)

        # Top pages summary
        top_pages = self.get_highest_scoring_pages(parsed_data, 3)
        report.append("\n🏆 TOP 3 HIGHEST-SCORING PAGES:")
        for i, page in enumerate(top_pages, 1):
            report.append(f"  {i}. Page {page['page_number']} - Score: {page['score']:.4f} ({page['relevance_level']})")

        # Threshold-based recommendations
        excellent_pages = self.get_pages_by_threshold(parsed_data, 0.90)
        very_good_pages = self.get_pages_by_threshold(parsed_data, 0.80)
        report.append("\n🎯 UTILIZATION RECOMMENDATIONS:")
        report.append(f"  🟢 Excellent pages (≥0.90): {len(excellent_pages)} pages - Use for primary context")
        report.append(f"  🟡 Very Good pages (≥0.80): {len(very_good_pages)} pages - Use for comprehensive coverage")

        # Statistics
        if parsed_data.get('statistics'):
            stats = parsed_data['statistics']
            report.append("\n📈 QUALITY METRICS:")
            if 'average_score' in stats:
                report.append(f"  Average Score: {stats['average_score']:.4f}")
            if 'highest_score' in stats:
                report.append(f"  Highest Score: {stats['highest_score']:.4f}")
            if 'total_pages' in stats:
                report.append(f"  Total Pages Analyzed: {stats['total_pages']}")

        # Usage suggestions
        report.append("\n💡 USAGE SUGGESTIONS:")
        report.append("  1. The system automatically uses the top 3 pages for RAG responses")
        report.append("  2. Excellent pages provide primary context")
        report.append("  3. Very good pages ensure comprehensive coverage")
        report.append("  4. Top-3 selection optimizes response quality")
        report.append("=" * 60)
        return "\n".join(report)

    def _get_relevance_level(self, score: float) -> str:
        """Get the relevance level for a score."""
        if score >= 0.90:
            return "🟢 EXCELLENT - Highly relevant"
        elif score >= 0.80:
            return "🟡 VERY GOOD - Very relevant"
        elif score >= 0.70:
            return "🟠 GOOD - Relevant"
        elif score >= 0.60:
            return "🔵 MODERATE - Somewhat relevant"
        elif score >= 0.50:
            return "🟣 BASIC - Minimally relevant"
        else:
            return "🔴 POOR - Not relevant"

    def _get_timestamp(self) -> str:
        """Get the current timestamp."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")


# Example usage function
def demonstrate_score_utilization():
    """Demonstrate how to use ScoreUtilizer to extract and utilize the highest-scoring pages."""
    print("🧪 DEMONSTRATING SCORE UTILIZATION")
    print("=" * 60)

    # Example log output (this would come from your actual retrieval system)
    example_log = """
================================================================================
🏆 RETRIEVAL SCORES - PAGE NUMBERS WITH HIGHEST SCORES
================================================================================
📂 Collection: documents_20250101_120000
📊 Total documents found: 15
🎯 Requested top-k: 5
--------------------------------------------------------------------------------
📄 Page 1 (doc_id: 0) | Score: 0.9234 | 🟢 EXCELLENT - Highly relevant
📄 Page 3 (doc_id: 2) | Score: 0.8756 | 🟡 VERY GOOD - Very relevant
📄 Page 7 (doc_id: 6) | Score: 0.8123 | 🟡 VERY GOOD - Very relevant
📄 Page 2 (doc_id: 1) | Score: 0.7890 | 🟠 GOOD - Relevant
📄 Page 5 (doc_id: 4) | Score: 0.7456 | 🟠 GOOD - Relevant
--------------------------------------------------------------------------------
🏆 HIGHEST SCORING PAGES:
1. Page 1 - Score: 0.9234
2. Page 3 - Score: 0.8756
3. Page 7 - Score: 0.8123
================================================================================
"""
    # Initialize the utilizer
    utilizer = ScoreUtilizer()

    # Parse the log output
    parsed_data = utilizer.parse_log_output(example_log)

    # Get the highest-scoring pages
    top_pages = utilizer.get_highest_scoring_pages(parsed_data, 3)
    print("\n🏆 TOP 3 HIGHEST-SCORING PAGES:")
    for page in top_pages:
        print(f"  Page {page['page_number']} - Score: {page['score']:.4f}")

    # Get pages by threshold
    excellent_pages = utilizer.get_pages_by_threshold(parsed_data, 0.90)
    print(f"\n🟢 EXCELLENT PAGES (≥0.90): {len(excellent_pages)} pages")

    # Generate the utilization report
    report = utilizer.generate_utilization_report(parsed_data)
    print(f"\n{report}")
    print("\n✅ Score utilization demonstration completed!")
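

# --- Optional downstream sketch (not part of the original module) ------------
# A minimal, hedged example of how the parsed top pages might be wired into a
# RAG prompt. The page_texts mapping (page number -> page text) is an assumed
# stand-in for whatever document store the retrieval system actually uses.
def build_context_from_top_pages(parsed_data: Dict, page_texts: Dict[int, str], count: int = 3) -> str:
    """Concatenate the text of the highest-scoring pages into one context block."""
    utilizer = ScoreUtilizer()
    chunks = []
    for page in utilizer.get_highest_scoring_pages(parsed_data, count):
        # Fall back to a placeholder when a page's text is not in the mapping.
        text = page_texts.get(page['page_number'], "<page text unavailable>")
        chunks.append(f"[Page {page['page_number']} | score {page['score']:.4f}]\n{text}")
    return "\n\n".join(chunks)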


if __name__ == "__main__":
    demonstrate_score_utilization()