#!/usr/bin/env python3
"""
Enhanced BitTransformerLM Generation Testing
=============================================

Test the promising generation improvements:
1. Autoregressive generation with automatic parity correction
2. Longer sequence generation (50, 100, 200+ characters)
3. Optimized diffusion parameters (50+ steps)
4. Direct comparison between generation methods

Goal: See if we can get from "barely-contextual gibberish" to actual language!
"""

import sys
import torch
import torch.nn.functional as F
from datetime import datetime

sys.path.append('/data')
sys.path.append('/data/BitTransformerLM')

from bit_transformer import (
    BitTransformerLM,
    text_to_bits,
    bits_to_text,
    diffusion_inference,
    set_dropout,
    enforce_parity
)

def load_full_attention_model(checkpoint_path='/data/BitTransformerLM/checkpoints/checkpoint_best.pt'):
    """
    Build the full-attention BitTransformerLM and restore its weights from a checkpoint.

    Args:
        checkpoint_path: Path of the checkpoint file to restore. The file must
            contain a 'model_state_dict' entry; 'epoch' and 'loss' entries are
            optional and are only reported in the status line.

    Returns:
        The model in eval mode, with dropout set to 0.0 via set_dropout.

    Raises:
        KeyError: If the checkpoint has no 'model_state_dict' entry.
        Any error raised by torch.load or load_state_dict (missing file,
        mismatched architecture) is propagated unchanged.
    """
    print("🚀 Loading Full Attention BitTransformerLM for enhanced generation testing...")

    # The architecture hyperparameters must match those the checkpoint was
    # trained with, otherwise load_state_dict will reject the weights.
    model = BitTransformerLM(
        d_model=512, nhead=16, num_layers=8, dim_feedforward=1024,
        max_seq_len=512, reversible=True, use_checkpoint=False,
        use_autocast=False, use_act=True, act_threshold=0.9,
        lambda_K=0.05, lambda_C=0.05, lambda_S=0.05,
        chunk_size=None, overlap=0, full_attn_logging=True
    )

    # NOTE: torch.load deserializes with pickle; only load checkpoints from a
    # trusted source.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])

    # Inference configuration: eval mode and no dropout.
    model.eval()
    set_dropout(model, 0.0)

    epoch = checkpoint.get('epoch', 'unknown')
    loss = checkpoint.get('loss', 'unknown')
    print(f"✅ Model loaded! Epoch: {epoch}, Loss: {loss}")

    return model

def autoregressive_generate_with_parity_correction(model, prompt, max_new_chars=20, temperature=0.7):
    """
    Generate text bit-by-bit, forcing a valid parity bit after every 8 sampled data bits.

    Each character is produced as 8 data bits taken from the model followed by
    one parity bit computed as ``sum(data_bits) % 2``. The newly generated bits
    are additionally passed through enforce_parity before being decoded.

    Args:
        model: Callable as ``model(tensor, causal=True)`` returning a
            ``(logits, telemetry)`` pair; ``logits[0, -1, :]`` is used as the
            next-bit distribution.
        prompt: Text to continue. It is converted with text_to_bits.
        max_new_chars: Number of 9-bit characters to generate.
        temperature: Sampling temperature. Values > 0 sample from the softmax
            of ``logits / temperature``; values <= 0 select the argmax bit.

    Returns:
        On success, a dict with 'success' (True), 'full_text', 'new_text',
        'bits_generated' and 'parity_corrections'. If decoding raises, a dict
        with 'success' (False), 'error' and 'bits_generated'.
    """
    print(f"\n🔄 Autoregressive generation with parity correction:")
    print(f"   Prompt: '{prompt}' → generating {max_new_chars} characters...")

    # Number of most recent bits fed to the model at each step.
    context_window = 400

    # Convert prompt to bits
    input_bits = text_to_bits(prompt)
    generated_bits = list(input_bits)

    with torch.no_grad():
        for _ in range(max_new_chars):
            char_bits = []

            # Sample the 8 data bits of this character one at a time.
            for _ in range(8):
                # Trim the history first so the full sequence is not copied every step.
                context = (generated_bits[-context_window:] + char_bits)[-context_window:]
                context_tensor = torch.tensor(context, dtype=torch.long).unsqueeze(0)

                logits, _telemetry = model(context_tensor, causal=True)
                next_bit_logits = logits[0, -1, :]

                if temperature > 0:
                    # Apply temperature for controlled randomness
                    probs = F.softmax(next_bit_logits / temperature, dim=-1)
                    next_bit = torch.multinomial(probs, 1).item()
                else:
                    next_bit = torch.argmax(next_bit_logits).item()

                char_bits.append(next_bit)

            # The parity bit is fully determined by the data bits, so no
            # forward pass is spent on it.
            char_bits.append(sum(char_bits) % 2)

            # Add character to generated sequence
            generated_bits.extend(char_bits)

    # Extract only the new bits (excluding prompt)
    new_bits = generated_bits[len(input_bits):]

    # Apply additional parity correction if needed
    new_bits_tensor = torch.tensor(new_bits, dtype=torch.long)
    corrected_bits_tensor, parity_corrections = enforce_parity(new_bits_tensor)
    corrected_bits = corrected_bits_tensor.tolist()

    try:
        # Decode new text
        decoded_text = bits_to_text(corrected_bits)
        full_result = prompt + decoded_text
        print(f"   ✅ SUCCESS: '{full_result}'")
        return {
            'success': True,
            'full_text': full_result,
            'new_text': decoded_text,
            'bits_generated': len(new_bits),
            'parity_corrections': parity_corrections
        }
    except Exception as e:
        # Best-effort harness: report the failure instead of aborting the run.
        print(f"   ❌ DECODE FAILED: {e}")
        return {
            'success': False,
            'error': str(e),
            'bits_generated': len(new_bits)
        }

def long_diffusion_generation(model, prompt, target_chars, steps=50):
    """
    Sample a bit sequence with diffusion_inference and decode it as a continuation.

    The requested length is ``target_chars * 9`` bits. The prompt is not passed
    to diffusion_inference (``init_bits=None``); it is only printed and
    prepended to the decoded text.

    Args:
        model: Model handed to diffusion_inference.
        prompt: Text prepended to the decoded output.
        target_chars: Number of 9-bit characters to request.
        steps: Number of diffusion steps.

    Returns:
        On success, a dict with 'success' (True), 'full_text', 'new_text',
        'bits_generated' and 'diffusion_steps'. If any step raises, a dict
        with 'success' (False), 'error' and 'diffusion_steps'.
    """
    print("\n🌊 Long diffusion generation:")
    print(f"   Prompt: '{prompt}' → generating {target_chars} characters with {steps} steps...")

    try:
        # 9 bits per character
        sampled = diffusion_inference(
            model,
            length=target_chars * 9,
            steps=steps,
            batch_size=1,
            init_bits=None,
            schedule="cosine",
        )

        # Drop singleton dimensions, then decode the bit list to text.
        bit_list = sampled.squeeze().tolist()
        decoded = bits_to_text(bit_list)

        combined = prompt + decoded
        print(f"   ✅ SUCCESS: '{combined}'")

        outcome = dict(
            success=True,
            full_text=combined,
            new_text=decoded,
            bits_generated=len(bit_list),
            diffusion_steps=steps,
        )
        return outcome

    except Exception as err:
        # Best-effort harness: any failure is reported, not raised.
        print(f"   ❌ FAILED: {err}")
        return dict(success=False, error=str(err), diffusion_steps=steps)

def test_length_scaling():
    """
    Run both generation methods for every prompt/length combination.

    Returns:
        A list with one dict per combination, holding 'prompt',
        'target_length' and the result dicts of the 'autoregressive' and
        'diffusion' runs.
    """
    print("\n📏 === LENGTH SCALING TESTS ===")
    print("Testing if longer generations show improved coherence...")

    model = load_full_attention_model()

    # Prompt-major order: every length is tried for a prompt before moving on.
    cases = [
        (prompt, length)
        for prompt in ("Hello", "The weather today", "I think that")
        for length in (10, 25, 50)
    ]

    results = []
    for prompt, length in cases:
        print(f"\n--- Testing '{prompt}' → {length} chars ---")

        # Autoregressive run first, then diffusion with a high step count.
        auto_result = autoregressive_generate_with_parity_correction(
            model, prompt, max_new_chars=length, temperature=0.6
        )
        diff_result = long_diffusion_generation(
            model, prompt, target_chars=length, steps=50
        )

        results.append(dict(
            prompt=prompt,
            target_length=length,
            autoregressive=auto_result,
            diffusion=diff_result,
        ))

    return results

def test_parameter_optimization():
    """
    Sweep sampling temperature (autoregressive) and step count (diffusion) on one prompt.

    Returns:
        A list of dicts. Autoregressive entries carry 'method', 'temperature'
        and 'result'; diffusion entries carry 'method', 'steps' and 'result'.
    """
    print("\n⚙️  === PARAMETER OPTIMIZATION TESTS ===")
    print("Testing different temperatures and diffusion steps...")

    model = load_full_attention_model()
    prompt = "Hello world"
    results = []

    # Temperature sweep for the autoregressive sampler.
    print("\n🌡️  Testing autoregressive temperatures:")
    for temp in (0.1, 0.5, 0.8, 1.0, 1.2):
        print(f"\n--- Temperature {temp} ---")
        outcome = autoregressive_generate_with_parity_correction(
            model, prompt, max_new_chars=20, temperature=temp
        )
        results.append(dict(method='autoregressive', temperature=temp, result=outcome))

    # Step-count sweep for diffusion.
    print("\n🌊 Testing diffusion steps:")
    for steps in (10, 25, 50, 100):
        print(f"\n--- {steps} steps ---")
        outcome = long_diffusion_generation(
            model, prompt, target_chars=20, steps=steps
        )
        results.append(dict(method='diffusion', steps=steps, result=outcome))

    return results

def _contains_common_word(text):
    """Return True if *text* contains a common English word as a whole token."""
    common_words = {'the', 'and', 'is', 'in', 'to', 'a'}
    # Treat every non-letter as a separator so that punctuation does not glue
    # tokens together, then compare whole tokens rather than substrings
    # (a substring test would match any text containing the letter "a").
    normalized = ''.join(ch if ch.isalpha() else ' ' for ch in text.lower())
    return not common_words.isdisjoint(normalized.split())


def test_coherence_prompts():
    """
    Run both generation methods on prompts meant to elicit structured continuations.

    After each prompt, a short note is printed for every method whose
    generated text contains at least one common English word as a whole token.

    Returns:
        A list with one dict per prompt, holding 'prompt' and the result
        dicts of the 'autoregressive' and 'diffusion' runs.
    """
    print("\n🎯 === COHERENCE PROMPTS TESTS ===")
    print("Testing prompts designed to elicit coherent language patterns...")

    model = load_full_attention_model()

    # Prompts that might elicit more structured responses
    coherence_prompts = [
        "Once upon a time",
        "The quick brown fox",
        "In the beginning",
        "Python code to print hello:",
        "def main():",
        "SELECT * FROM",
        "Today is a beautiful",
        "My name is",
        "The answer is",
        "import torch"
    ]

    results = []

    for prompt in coherence_prompts:
        print(f"\n--- Testing coherence with: '{prompt}' ---")

        # Test both methods with longer generation
        auto_result = autoregressive_generate_with_parity_correction(
            model, prompt, max_new_chars=30, temperature=0.7
        )

        diff_result = long_diffusion_generation(
            model, prompt, target_chars=30, steps=75
        )

        results.append({
            'prompt': prompt,
            'autoregressive': auto_result,
            'diffusion': diff_result
        })

        # Quick analysis: only successful runs carry decodable 'new_text'.
        if auto_result.get('success') and _contains_common_word(auto_result.get('new_text', '')):
            print(f"   🎉 Autoregressive contains common words!")

        if diff_result.get('success') and _contains_common_word(diff_result.get('new_text', '')):
            print(f"   🎉 Diffusion contains common words!")

    return results

def main():
    """
    Run all enhanced generation tests and print an overall summary.

    The success counts and the search for word-like outputs cover the
    length-scaling and coherence results, whose entries carry
    'autoregressive' and 'diffusion' result dicts. The parameter sweep results
    are returned but not included in those counts.

    Returns:
        A dict with 'length_results', 'param_results', 'coherence_results' and
        a 'summary' dict holding 'autoregressive_successes',
        'diffusion_successes' and 'promising_outputs' (a count).
    """
    print("🚀 ENHANCED BITRANSFORMERLM GENERATION TESTING")
    print("=" * 60)
    print("Testing potential fixes:")
    print("1. Autoregressive with parity correction")
    print("2. Longer sequence generation")
    print("3. Optimized generation parameters")
    print("4. Coherence-focused prompts")
    print("=" * 60)

    # Run all tests
    length_results = test_length_scaling()
    param_results = test_parameter_optimization()
    coherence_results = test_coherence_prompts()

    # Summary analysis
    print("\n🎯 === OVERALL ANALYSIS ===")

    all_results = length_results + coherence_results
    # The denominator is derived from the runs actually counted, so it stays
    # correct if the prompt or length lists change.
    total_runs = len(all_results)

    # Count successes
    total_auto = sum(
        1 for r in all_results if r.get('autoregressive', {}).get('success')
    )
    total_diff = sum(
        1 for r in all_results if r.get('diffusion', {}).get('success')
    )

    print(f"Autoregressive success rate: {total_auto}/{total_runs}")
    print(f"Diffusion success rate: {total_diff}/{total_runs}")

    # Look for promising outputs
    print("\n🔍 Looking for signs of linguistic improvement...")

    promising_outputs = []

    for result in all_results:
        for method in ['autoregressive', 'diffusion']:
            if not result.get(method, {}).get('success'):
                continue
            text = result[method].get('new_text', '')
            # Word-like pattern: more than 10 characters overall and at least
            # one purely alphabetic whitespace-delimited token longer than 2.
            if len(text) <= 10 or not any(c.isalpha() for c in text):
                continue
            if any(len(word) > 2 and word.isalpha() for word in text.split()):
                promising_outputs.append({
                    'prompt': result['prompt'],
                    'method': method,
                    'text': text
                })

    if promising_outputs:
        print(f"\n🎉 Found {len(promising_outputs)} promising outputs with word-like patterns!")
        for output in promising_outputs[:5]:  # Show first 5
            print(f"   {output['method']}: '{output['prompt']}' → '{output['text']}'")
    else:
        print("\n💭 No clear word patterns found yet - model may need more training or different approach")

    return {
        'length_results': length_results,
        'param_results': param_results,
        'coherence_results': coherence_results,
        'summary': {
            'autoregressive_successes': total_auto,
            'diffusion_successes': total_diff,
            'promising_outputs': len(promising_outputs)
        }
    }

# Run every test when executed as a script; the dict returned by main() is
# bound to the module-level name `results`.
if __name__ == "__main__":
    results = main()