Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Community

Shubham170793 committed on Oct 17

Commit

b61a150

verified ·

1 Parent(s): 32f64de

Update src/ingestion.py

Browse files

Files changed (1) hide show

src/ingestion.py +44 -38

src/ingestion.py CHANGED Viewed

@@ -9,11 +9,6 @@ def extract_text_from_pdf(file_path: str) -> str:
     """
     Extracts and cleans text from a PDF using PyMuPDF.
     Handles noisy layout artifacts, page numbers, and TOC dots.
-    Args:
-        file_path (str): Path to the PDF file.
-    Returns:
-        str: Cleaned, normalized text.
     """
     text = ""
     try:
@@ -24,15 +19,24 @@ def extract_text_from_pdf(file_path: str) -> str:
                 # Fallback: handle scanned or weirdly structured pages
                 if not page_text:
                     blocks = page.get_text("blocks")
-                    page_text = " ".join(block[4] for block in blocks if isinstance(block[4], str))
                 # 🔹 NEW: ensure bullets & numbered sections start on new lines
                 page_text = page_text.replace("• ", "\n• ")
                 page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
                 # Remove repeating headers/footers (e.g., “PUBLIC”, “Page 5 of 110”)
-                page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
-                page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
                 text += page_text + "\n"
@@ -48,16 +52,13 @@ def extract_text_from_pdf(file_path: str) -> str:
 # 2️⃣ ADVANCED CLEANING PIPELINE (SAP / Enterprise PDFs)
 # ==========================================================
 def clean_text(text: str) -> str:
-    """
-    Cleans noisy extracted PDF text before chunking and embedding.
-    Handles TOC artifacts, broken lines, bullets, and special characters.
-    """
-    # Normalize Unicode (e.g., weird quotes, ligatures)
     text = unicodedata.normalize("NFKD", text)
     # Remove TOC or numbering noise (e.g., “6.3.1 Prerequisites .............. 53”)
-    text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
     # Replace bullet symbols and dots with consistent spacing
     text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
@@ -67,7 +68,9 @@ def clean_text(text: str) -> str:
     text = re.sub(r"-\s*\n", "", text)
     # Remove page headers/footers (common in SAP docs)
-    text = re.sub(r"\n\s*(PUBLIC|PRIVATE|Confidential)\s*\n", "\n", text, flags=re.IGNORECASE)
     text = re.sub(r"©\s*[A-Z].*?\d{4}", "", text)
     # Normalize newlines → paragraph breaks
@@ -81,60 +84,63 @@ def clean_text(text: str) -> str:
     # Remove multiple section dots from TOC lines
     text = re.sub(r"(\s*\.\s*){3,}", " ", text)
-    # Trim and normalize spacing
-    text = text.strip()
-    return text
 # ==========================================================
-# 3️⃣ SMART CHUNKING (Step-Aware + Sentence Backup)
 # ==========================================================
-def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
     """
     Enhanced chunking for structured enterprise PDFs (SAP guides).
     ✅ Keeps bullet lists, numbered steps, and headings together.
-    ✅ Avoids breaking chunks mid-list or mid-section.
     """
     # Normalize whitespace
     text = re.sub(r"\s+", " ", text.strip())
-    # --- Step 1️⃣: Split into logical sections by headings or step titles ---
-    # Detect section headers like "3.1.2 Prerequisites for Commerce Automation", "Step 2:", etc.
-    section_pattern = r"(?=(?:\n?\d+(?:\.\d+){0,3}\s+[A-Z][^\n]{3,100})|(?:Step\s*\d+[:.\s]))"
     sections = re.split(section_pattern, text)
     sections = [s.strip() for s in sections if s and isinstance(s, str) and s.strip()]
     chunks = []
     for section in sections:
-        # --- Step 2️⃣: Merge multi-line bullets ---
-        # e.g., "- Ensure that..." or "• Activate the feature..."
         section = re.sub(r"\n\s*[-•▪‣]\s*", " • ", section)
         bullets = re.split(r"(?=\s*[-•▪‣]\s)", section)
         bullets = [b.strip() for b in bullets if b.strip()]
-        # Case A: Multiple bullets (keep as one coherent block)
         if len(bullets) > 2:
             combined = " ".join(bullets)
-            # If the bullet section is very long, split every few bullets
             if len(combined) > chunk_size * 1.5:
                 for i in range(0, len(bullets), 6):
                     block = " ".join(bullets[i:i+6])
                     chunks.append(block.strip())
             else:
                 chunks.append(combined.strip())
-        # Case B: Single bullet or normal paragraph → split by sentence
         else:
             chunks.extend(_split_by_sentence(section, chunk_size, overlap))
-    # --- Step 3️⃣: Merge small fragments to keep continuity ---
     chunks = _merge_small_chunks(chunks, min_len=200)
-    # --- Step 4️⃣: Ensure overlap continuity between neighboring chunks ---
     final_chunks = []
     for i, ch in enumerate(chunks):
         if i == 0:
@@ -152,7 +158,7 @@ def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
 # ==========================================================
 def _split_by_sentence(text, chunk_size=800, overlap=80):
     """Split by sentence punctuation to preserve semantics."""
-    sentences = re.split(r'(?<=[.!?])\s+', text)
     chunks, current = [], ""
     for sent in sentences:
         if len(current) + len(sent) + 1 <= chunk_size:
@@ -189,6 +195,6 @@ def _merge_small_chunks(chunks, min_len=150):
 if __name__ == "__main__":
     pdf_path = "sample.pdf"
     text = extract_text_from_pdf(pdf_path)
-    chunks = chunk_text(text, chunk_size=600, overlap=100)
     for i, c in enumerate(chunks[:5], 1):
         print(f"\n--- Chunk {i} ---\n{c[:500]}...\n")

     """
     Extracts and cleans text from a PDF using PyMuPDF.
     Handles noisy layout artifacts, page numbers, and TOC dots.
     """
     text = ""
     try:
                 # Fallback: handle scanned or weirdly structured pages
                 if not page_text:
                     blocks = page.get_text("blocks")
+                    page_text = " ".join(
+                        block[4] for block in blocks if isinstance(block[4], str)
+                    )
                 # 🔹 NEW: ensure bullets & numbered sections start on new lines
                 page_text = page_text.replace("• ", "\n• ")
                 page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
                 # Remove repeating headers/footers (e.g., “PUBLIC”, “Page 5 of 110”)
+                page_text = re.sub(
+                    r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE
+                )
+                page_text = re.sub(
+                    r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})",
+                    "",
+                    page_text,
+                    flags=re.IGNORECASE,
+                )
                 text += page_text + "\n"
 # 2️⃣ ADVANCED CLEANING PIPELINE (SAP / Enterprise PDFs)
 # ==========================================================
 def clean_text(text: str) -> str:
+    """Cleans noisy extracted PDF text before chunking and embedding."""
     text = unicodedata.normalize("NFKD", text)
     # Remove TOC or numbering noise (e.g., “6.3.1 Prerequisites .............. 53”)
+    text = re.sub(
+        r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text
+    )
     # Replace bullet symbols and dots with consistent spacing
     text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
     text = re.sub(r"-\s*\n", "", text)
     # Remove page headers/footers (common in SAP docs)
+    text = re.sub(
+        r"\n\s*(PUBLIC|PRIVATE|Confidential)\s*\n", "\n", text, flags=re.IGNORECASE
+    )
     text = re.sub(r"©\s*[A-Z].*?\d{4}", "", text)
     # Normalize newlines → paragraph breaks
     # Remove multiple section dots from TOC lines
     text = re.sub(r"(\s*\.\s*){3,}", " ", text)
+    return text.strip()
 # ==========================================================
+# 3️⃣ SMART CHUNKING (Step-Aware + Auto-Sized)
 # ==========================================================
+def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
     """
     Enhanced chunking for structured enterprise PDFs (SAP guides).
+    ✅ Auto-selects chunk size based on document length.
     ✅ Keeps bullet lists, numbered steps, and headings together.
     """
+    # --- Auto-tune chunk size based on document length ---
+    text_length = len(text)
+    if chunk_size is None:
+        if text_length > 200000:
+            chunk_size, overlap = 2000, 250
+        elif text_length > 50000:
+            chunk_size, overlap = 1500, 200
+        else:
+            chunk_size, overlap = 1000, 150
+    elif overlap is None:
+        overlap = 150
+    print(f"⚙️ Auto-selected chunk_size={chunk_size}, overlap={overlap} (len={text_length})")
     # Normalize whitespace
     text = re.sub(r"\s+", " ", text.strip())
+    # --- Step 1️⃣: Split into logical sections ---
+    section_pattern = (
+        r"(?=(?:\n?\d+(?:\.\d+){0,3}\s+[A-Z][^\n]{3,100})|(?:Step\s*\d+[:.\s]))"
+    )
     sections = re.split(section_pattern, text)
     sections = [s.strip() for s in sections if s and isinstance(s, str) and s.strip()]
     chunks = []
     for section in sections:
         section = re.sub(r"\n\s*[-•▪‣]\s*", " • ", section)
         bullets = re.split(r"(?=\s*[-•▪‣]\s)", section)
         bullets = [b.strip() for b in bullets if b.strip()]
         if len(bullets) > 2:
             combined = " ".join(bullets)
             if len(combined) > chunk_size * 1.5:
                 for i in range(0, len(bullets), 6):
                     block = " ".join(bullets[i:i+6])
                     chunks.append(block.strip())
             else:
                 chunks.append(combined.strip())
         else:
             chunks.extend(_split_by_sentence(section, chunk_size, overlap))
     chunks = _merge_small_chunks(chunks, min_len=200)
+    # --- Ensure overlap continuity ---
     final_chunks = []
     for i, ch in enumerate(chunks):
         if i == 0:
 # ==========================================================
 def _split_by_sentence(text, chunk_size=800, overlap=80):
     """Split by sentence punctuation to preserve semantics."""
+    sentences = re.split(r"(?<=[.!?])\s+", text)
     chunks, current = [], ""
     for sent in sentences:
         if len(current) + len(sent) + 1 <= chunk_size:
 if __name__ == "__main__":
     pdf_path = "sample.pdf"
     text = extract_text_from_pdf(pdf_path)
+    chunks = chunk_text(text)
     for i, c in enumerate(chunks[:5], 1):
         print(f"\n--- Chunk {i} ---\n{c[:500]}...\n")