Spaces:

Qar-Raz
/

NLP-RAG

Runtime error

App Files Files Community

NLP-RAG / data_loader.py

Muddasri

Changed ingestion Logic

1c7cc69 3 days ago

raw

history blame contribute delete

3.09 kB

	import re
	import pandas as pd
	from typing import List, Dict, Any


	def load_cbt_book(
	    file_path: str = "EntireBookCleaned.txt",
	    *,
	    source_url: str = (
	        "https://res.cloudinary.com/dajb4c1g5/image/upload/v1774864993/"
	        "topic_pdfs/93/merged_pdf_1774864989649.pdf.pdf"
	    ),
	) -> pd.DataFrame:
	    """
	    Load the CBT book from a text file and parse it into one document per page.

	    The text is split on page markers of the form "--- Page X ---" or
	    "--- Page X of Y ---". Any text before the first marker is discarded,
	    and pages whose content is empty after stripping are skipped.

	    Args:
	        file_path: Path to the cleaned book text file.
	        source_url: Base URL of the source PDF. Each document's ``url`` is
	            this value followed by ``#page=<page number>``. Defaults to the
	            URL that was previously hard-coded in this function.

	    Returns:
	        DataFrame with columns: id, title, url, full_text.

	    Raises:
	        FileNotFoundError: If ``file_path`` does not exist.
	        ValueError: If no non-empty page could be parsed from the file.
	    """
	    try:
	        with open(file_path, 'r', encoding='utf-8') as f:
	            content = f.read()
	    except FileNotFoundError as err:
	        # Chain explicitly so the original OS-level error stays attached.
	        raise FileNotFoundError(f"Book file not found: {file_path}") from err

	    # Pattern matches "--- Page X ---" or "--- Page X of Y ---"
	    page_pattern = r'---\sPage\s+(\d+)(?:\s+of\s+\d+)?\s---'

	    # Because the pattern has one capturing group, re.split returns
	    # [preamble, page_number, page_text, page_number, page_text, ...].
	    pieces = re.split(page_pattern, content)

	    documents = []
	    # Pair every captured page number (odd index) with the text after it.
	    for raw_number, raw_text in zip(pieces[1::2], pieces[2::2]):
	        page_num = raw_number.strip()

	        # Collapse runs of 3+ newlines into a single blank line.
	        page_content = re.sub(r'\n{3,}', '\n\n', raw_text.strip()).strip()
	        if not page_content:
	            continue  # Only keep non-empty pages

	        # Use the first line as the title when its length is plausible,
	        # otherwise fall back to a title built from the page number.
	        title_line = page_content.split('\n', 1)[0].strip()
	        if 10 < len(title_line) < 200:
	            title = title_line
	        else:
	            title = f"CBT Book - Page {page_num}"

	        documents.append({
	            "id": f"cbt-page-{page_num}",
	            "title": title,
	            "url": f"{source_url}#page={page_num}",
	            "full_text": page_content,
	        })

	    if not documents:
	        raise ValueError("No documents were parsed from the book file")

	    df = pd.DataFrame(documents)
	    print(f"Loaded {len(df)} pages from CBT book")
	    return df


	def get_book_stats(df: pd.DataFrame) -> Dict[str, Any]:
	    """
	    Summarise the character counts of the loaded book pages.

	    Args:
	        df: DataFrame containing book pages; must have a 'full_text' column

	    Returns:
	        Dictionary with the page count, the total, average (rounded to two
	        decimals), minimum and maximum number of characters per page
	    """
	    # Per-page character counts, computed once and reused for every statistic.
	    text_lengths = df['full_text'].str.len()

	    stats: Dict[str, Any] = {"total_pages": len(df)}
	    stats["total_characters"] = text_lengths.sum()
	    stats["average_chars_per_page"] = round(text_lengths.mean(), 2)
	    stats["min_chars"] = text_lengths.min()
	    stats["max_chars"] = text_lengths.max()
	    return stats