Spaces:

Qar-Raz
/

NLP-RAG

Running

File size: 3,094 Bytes

import re
import pandas as pd
from typing import List, Dict, Any


# Location of the source PDF; a "#page=N" fragment is appended for each page.
_DEFAULT_PDF_URL = (
    "https://res.cloudinary.com/dajb4c1g5/image/upload/v1774864993/"
    "topic_pdfs/93/merged_pdf_1774864989649.pdf.pdf"
)

# Matches "--- Page X ---" or "--- Page X of Y ---"; the only capture group is X.
_PAGE_MARKER = re.compile(r'---\s*Page\s+(\d+)(?:\s+of\s+\d+)?\s*---')


def load_cbt_book(
    file_path: str = "EntireBookCleaned.txt",
    *,
    pdf_url: str = _DEFAULT_PDF_URL,
) -> pd.DataFrame:
    """
    Load the CBT book from a text file and parse it into one document per page.

    The text is split on page markers of the form "--- Page X ---" or
    "--- Page X of Y ---". Anything before the first marker is ignored, and
    pages whose text is empty after trimming are skipped.

    Args:
        file_path: Path to the cleaned book text file (read as UTF-8).
        pdf_url: Base URL of the source PDF. Each document's ``url`` is this
            value followed by ``#page=<page number>``.

    Returns:
        DataFrame with columns: id, title, url, full_text

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If no non-empty page could be parsed from the file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError as err:
        raise FileNotFoundError(f"Book file not found: {file_path}") from err

    # Splitting on a pattern with one capture group yields
    # [preamble, number, text, number, text, ...], so the odd indices hold
    # page numbers and the following even indices hold the matching page text.
    parts = _PAGE_MARKER.split(content)

    documents = []
    for page_num, raw_text in zip(parts[1::2], parts[2::2]):
        # Trim the page and collapse runs of 3+ newlines into a blank line.
        page_content = re.sub(r'\n{3,}', '\n\n', raw_text.strip())
        if not page_content:
            continue  # Only keep non-empty pages

        # Use the first line as the title when it has a plausible length;
        # otherwise fall back to a generic page-number title.
        first_line = page_content.split('\n', 1)[0].strip()
        if 10 < len(first_line) < 200:
            title = first_line
        else:
            title = f"CBT Book - Page {page_num}"

        documents.append({
            "id": f"cbt-page-{page_num}",
            "title": title,
            "url": f"{pdf_url}#page={page_num}",
            "full_text": page_content,
        })

    if not documents:
        raise ValueError("No documents were parsed from the book file")

    df = pd.DataFrame(documents)
    print(f"Loaded {len(df)} pages from CBT book")
    return df


def get_book_stats(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Get statistics about the loaded book.

    All values are returned as built-in ``int``/``float`` objects rather than
    NumPy scalars, so the result can be passed directly to ``json.dumps``.

    Args:
        df: DataFrame containing book pages; must have a ``full_text`` column
            of strings.

    Returns:
        Dictionary with the keys ``total_pages``, ``total_characters``,
        ``average_chars_per_page`` (rounded to 2 decimals), ``min_chars`` and
        ``max_chars``. When there is no text to measure (no rows, or every
        ``full_text`` value is missing) the character statistics are 0.
    """
    # Compute the per-page lengths once; missing texts yield NaN and are
    # dropped, which matches the NaN-skipping behaviour of sum/mean/min/max.
    lengths = df['full_text'].str.len().dropna()

    if lengths.empty:
        # Avoid NaN results (and int(NaN) errors) when nothing can be measured.
        return {
            "total_pages": len(df),
            "total_characters": 0,
            "average_chars_per_page": 0.0,
            "min_chars": 0,
            "max_chars": 0,
        }

    return {
        "total_pages": len(df),
        "total_characters": int(lengths.sum()),
        "average_chars_per_page": float(round(lengths.mean(), 2)),
        "min_chars": int(lengths.min()),
        "max_chars": int(lengths.max()),
    }