NLP-RAG / data_loader.py
Muddasri's picture
Changed ingestion Logic
1c7cc69
import re
import pandas as pd
from typing import List, Dict, Any
# Default location of the book PDF; "#page=N" is appended per document.
_DEFAULT_BOOK_PDF_URL = "https://res.cloudinary.com/dajb4c1g5/image/upload/v1774864993/topic_pdfs/93/merged_pdf_1774864989649.pdf.pdf"


def load_cbt_book(
    file_path: str = "EntireBookCleaned.txt",
    *,
    base_url: str = _DEFAULT_BOOK_PDF_URL,
) -> pd.DataFrame:
    """
    Load the CBT book from a text file and parse it into one document per page.

    Pages are delimited by markers of the form "--- Page X ---" or
    "--- Page X of Y ---". Text before the first marker is ignored, and
    pages whose text is empty after stripping are skipped.

    Args:
        file_path: Path to the cleaned book text file.
        base_url: URL of the book PDF. Each document's ``url`` is this value
            with ``#page=<page number>`` appended.

    Returns:
        DataFrame with columns: id, title, url, full_text

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If no non-empty page could be parsed from the file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError as err:
        # Chain explicitly so the traceback reads as one error, not two.
        raise FileNotFoundError(f"Book file not found: {file_path}") from err

    # Pattern matches "--- Page X ---" or "--- Page X of Y ---".
    page_pattern = r'---\s*Page\s+(\d+)(?:\s+of\s+\d+)?\s*---'

    # Because the pattern has one capturing group, re.split returns
    # [preamble, page_number, page_text, page_number, page_text, ...].
    pages = re.split(page_pattern, content)

    documents = []
    # Skip the preamble and walk the (page_number, page_text) pairs.
    for page_num, raw_text in zip(pages[1::2], pages[2::2]):
        # Collapse runs of 3+ newlines down to a single blank line.
        page_content = re.sub(r'\n{3,}', '\n\n', raw_text.strip()).strip()
        if not page_content:
            continue  # Only keep non-empty pages

        # Use the page's first line as the title when its length is
        # plausible for a heading; otherwise fall back to the page number.
        title_line = page_content.split('\n', 1)[0].strip()
        if 10 < len(title_line) < 200:
            title = title_line
        else:
            title = f"CBT Book - Page {page_num}"

        documents.append({
            "id": f"cbt-page-{page_num}",
            "title": title,
            "url": f"{base_url}#page={page_num}",
            "full_text": page_content,
        })

    if not documents:
        raise ValueError("No documents were parsed from the book file")

    df = pd.DataFrame(documents)
    print(f"Loaded {len(df)} pages from CBT book")
    return df
def get_book_stats(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Get statistics about the loaded book.

    All numeric values are returned as native Python ``int``/``float`` (not
    numpy scalars) so the result can be passed straight to ``json.dumps``.

    Args:
        df: DataFrame containing book pages; must have a ``full_text``
            column of strings.

    Returns:
        Dictionary with keys ``total_pages``, ``total_characters``,
        ``average_chars_per_page`` (rounded to 2 decimals), ``min_chars``
        and ``max_chars``. When there is no text to measure, every
        character statistic is zero.
    """
    # Compute per-page lengths once; missing texts yield NaN and are dropped,
    # matching the NaN-skipping behavior of the pandas reductions.
    lengths = df['full_text'].str.len().dropna()

    if lengths.empty:
        # Reductions over an empty Series give NaN, which is not a useful
        # statistic and cannot be converted to int.
        return {
            "total_pages": len(df),
            "total_characters": 0,
            "average_chars_per_page": 0.0,
            "min_chars": 0,
            "max_chars": 0,
        }

    return {
        "total_pages": len(df),
        "total_characters": int(lengths.sum()),
        "average_chars_per_page": round(float(lengths.mean()), 2),
        "min_chars": int(lengths.min()),
        "max_chars": int(lengths.max()),
    }