| import re |
| import pandas as pd |
| from typing import List, Dict, Any |
|
|
|
|
# Page separator, e.g. "--- Page 12 ---" or "--- Page 12 of 340 ---".
# The single capture group makes re.split() keep the page number in its output.
_PAGE_MARKER_RE = re.compile(r'---\s*Page\s+(\d+)(?:\s+of\s+\d+)?\s*---')

# Runs of three or more newlines are collapsed to a single blank line.
_EXCESS_NEWLINES_RE = re.compile(r'\n{3,}')

# PDF that the generated per-page links point into by default.
_DEFAULT_BOOK_URL = (
    "https://res.cloudinary.com/dajb4c1g5/image/upload/v1774864993/"
    "topic_pdfs/93/merged_pdf_1774864989649.pdf.pdf"
)


def load_cbt_book(
    file_path: str = "EntireBookCleaned.txt",
    base_url: str = _DEFAULT_BOOK_URL,
) -> pd.DataFrame:
    """
    Load the CBT book from a text file and parse it into one document per page.

    The text is split on page markers of the form ``--- Page N ---`` or
    ``--- Page N of M ---``. Any text before the first marker is discarded,
    and pages whose content is empty after stripping are skipped.

    Args:
        file_path: Path to the cleaned book text file.
        base_url: URL of the book PDF; each document's ``url`` is this value
            followed by ``#page=<page number>``.

    Returns:
        DataFrame with columns: id, title, url, full_text

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If no non-empty page could be parsed from the file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError as err:
        raise FileNotFoundError(f"Book file not found: {file_path}") from err

    # With one capture group, re.split() returns
    #   [text_before_first_marker, num_1, text_1, num_2, text_2, ...]
    # so page numbers sit at odd indices and their text at the following
    # even index.
    pieces = _PAGE_MARKER_RE.split(content)

    documents: List[Dict[str, str]] = []
    for raw_number, raw_text in zip(pieces[1::2], pieces[2::2]):
        page_num = raw_number.strip()
        page_content = _EXCESS_NEWLINES_RE.sub('\n\n', raw_text.strip()).strip()

        if not page_content:
            continue

        # Use the first line as the title only when its length looks like a
        # heading; otherwise fall back to a generic page title.
        title_line = page_content.split('\n', 1)[0].strip()
        if 10 < len(title_line) < 200:
            title = title_line
        else:
            title = f"CBT Book - Page {page_num}"

        documents.append({
            "id": f"cbt-page-{page_num}",
            "title": title,
            "url": f"{base_url}#page={page_num}",
            "full_text": page_content,
        })

    if not documents:
        raise ValueError("No documents were parsed from the book file")

    df = pd.DataFrame(documents)
    print(f"Loaded {len(df)} pages from CBT book")
    return df
|
|
|
|
def get_book_stats(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Get character-count statistics about the loaded book.

    Args:
        df: DataFrame containing book pages; must have a ``full_text``
            column of strings.

    Returns:
        Dictionary with the keys ``total_pages``, ``total_characters``,
        ``average_chars_per_page`` (rounded to 2 decimals), ``min_chars``
        and ``max_chars``. All values are native Python ``int``/``float``
        so the result can be serialized directly (e.g. with ``json``).
        When there is no text to measure, the character statistics are 0.
    """
    # Measure each page once instead of recomputing the lengths per statistic.
    # Missing texts (NaN) have no length; dropping them matches how pandas'
    # sum/mean/min/max skip NaN by default.
    lengths = df['full_text'].str.len().dropna()
    total_pages = len(df)

    if lengths.empty:
        # mean/min/max of nothing would be NaN; report zeros instead.
        return {
            "total_pages": total_pages,
            "total_characters": 0,
            "average_chars_per_page": 0.0,
            "min_chars": 0,
            "max_chars": 0,
        }

    return {
        "total_pages": total_pages,
        "total_characters": int(lengths.sum()),
        "average_chars_per_page": round(float(lengths.mean()), 2),
        "min_chars": int(lengths.min()),
        "max_chars": int(lengths.max()),
    }