import re
from typing import Any, Dict, List

import pandas as pd

# Matches "--- Page X ---" or "--- Page X of Y ---"; the single capture group
# is the page number, so re.split() interleaves page numbers with page text.
_PAGE_MARKER = re.compile(r'---\s*Page\s+(\d+)(?:\s+of\s+\d+)?\s*---')

# PDF that page links point at when the caller does not pass ``base_url``.
_DEFAULT_BOOK_URL = (
    "https://res.cloudinary.com/dajb4c1g5/image/upload/v1774864993/"
    "topic_pdfs/93/merged_pdf_1774864989649.pdf.pdf"
)


def load_cbt_book(
    file_path: str = "EntireBookCleaned.txt",
    base_url: str = _DEFAULT_BOOK_URL,
) -> pd.DataFrame:
    """
    Loads the CBT book from a text file and parses it into documents.
    Each page is treated as a separate document.

    The file is split on page markers of the form "--- Page X ---" or
    "--- Page X of Y ---". Text before the first marker is discarded, and
    pages whose text is empty after trimming are skipped.

    Args:
        file_path: Path to the cleaned book text file
        base_url: URL of the source PDF; "#page=<n>" is appended to build
            each document's link. Defaults to the original hard-coded PDF.

    Returns:
        DataFrame with columns: id, title, url, full_text

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If no non-empty page could be parsed from the file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError as err:
        raise FileNotFoundError(f"Book file not found: {file_path}") from err

    # Because the pattern has one capture group, the split result is
    # [preamble, page_number, page_text, page_number, page_text, ...].
    pieces = _PAGE_MARKER.split(content)

    documents: List[Dict[str, str]] = []
    for page_num, raw_text in zip(pieces[1::2], pieces[2::2]):
        # Trim the page and collapse runs of 3+ newlines into one blank line.
        page_content = re.sub(r'\n{3,}', '\n\n', raw_text.strip())
        if not page_content:
            continue  # Only add non-empty pages

        # Use the first line as the title when its length looks like a
        # heading; otherwise fall back to a generic page-number title.
        first_line = page_content.partition('\n')[0].strip()
        if 10 < len(first_line) < 200:
            title = first_line
        else:
            title = f"CBT Book - Page {page_num}"

        documents.append({
            "id": f"cbt-page-{page_num}",
            "title": title,
            "url": f"{base_url}#page={page_num}",
            "full_text": page_content,
        })

    if not documents:
        raise ValueError("No documents were parsed from the book file")

    df = pd.DataFrame(documents)
    print(f"Loaded {len(df)} pages from CBT book")
    return df


def get_book_stats(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Get statistics about the loaded book.

    All values are returned as built-in ``int``/``float`` objects (not
    numpy scalars), so the result can be passed straight to ``json.dumps``.

    Args:
        df: DataFrame containing book pages; must have a ``full_text`` column

    Returns:
        Dictionary with keys total_pages, total_characters,
        average_chars_per_page (rounded to 2 decimals), min_chars and
        max_chars. When there is no text to measure, every character
        statistic is 0.
    """
    # Compute the per-page lengths once. Missing values are dropped, which
    # matches the default NaN-skipping behaviour of sum/mean/min/max.
    lengths = df['full_text'].str.len().dropna()
    total_pages = len(df)

    if lengths.empty:
        # mean/min/max of nothing would be NaN; report zeros instead.
        return {
            "total_pages": total_pages,
            "total_characters": 0,
            "average_chars_per_page": 0.0,
            "min_chars": 0,
            "max_chars": 0,
        }

    return {
        "total_pages": total_pages,
        "total_characters": int(lengths.sum()),
        "average_chars_per_page": round(float(lengths.mean()), 2),
        "min_chars": int(lengths.min()),
        "max_chars": int(lengths.max()),
    }