Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

Shubham170793 committed on Oct 18

Commit

499bde3

verified ·

1 Parent(s): abee1e5

Update src/ingestion.py

Browse files

Files changed (1) hide show

src/ingestion.py +37 -17

src/ingestion.py CHANGED Viewed

@@ -90,46 +90,66 @@ def clean_text(text: str) -> str:
     return text.strip()
-# ==========================================================
-# 3️⃣ TABLE OF CONTENTS DETECTION
-# ==========================================================
 # ==========================================================
 # 3️⃣ TABLE OF CONTENTS DETECTION (Improved)
 # ==========================================================
 def extract_table_of_contents(text: str):
     """
-    Detects Table of Contents (TOC) in PDFs.
-    Supports variants like 'Contents', 'Index', or 'Overview'.
     Returns list of (section_number, section_title).
     """
     toc_entries = []
     lines = text.split("\n")
     toc_started = False
     for i, line in enumerate(lines):
-        # Detect possible TOC header variants
-        if not toc_started and re.search(r"\b(table\s*of\s*contents|contents|index|overview)\b", line, re.IGNORECASE):
-            # Confidence check — look ahead a few lines
-            next_lines = lines[i + 1 : i + 6]
             if any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in next_lines):
                 toc_started = True
                 continue
-        if toc_started:
-            # Stop scanning when main content starts (e.g., "Step 1:" or "1. Introduction")
-            if re.match(r"^\s*(Step\s*\d+|1\.\s*[A-Z])", line):
-                break
-            # Match lines like "3.2 Configure Endpoints ........ 13"
-            match = re.match(r"^\s*(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z0-9\s/&()-]+)", line)
             if match:
                 section = match.group(1).strip()
                 title = match.group(2).strip()
-                if len(title) > 3:
                     toc_entries.append((section, title))
-    return toc_entries
 # ==========================================================

     return text.strip()
 # ==========================================================
 # 3️⃣ TABLE OF CONTENTS DETECTION (Improved)
 # ==========================================================
 def extract_table_of_contents(text: str):
     """
     Extract table-of-contents entries from plain document text.

     A TOC region starts either at a header line ('Table of Contents',
     'Contents', 'Content', 'Index', 'Overview') that is followed within the
     next 7 lines by a numbered entry, or implicitly at a numbered line when
     at least 3 of the 5 lines starting there are numbered entries.
     Scanning stops at the first line that looks like body content
     ('Step <n>' or a capitalized word followed by another capitalized word).

     Args:
         text: Full document text with lines separated by '\\n'.

     Returns:
         List of (section_number, section_title) tuples in order of first
         appearance; duplicates (same number, case-insensitive title) are
         dropped. Returns an empty list for empty input or when no TOC
         region is found.
     """
     if not text:
         return []
     # Compile once per call instead of re-specifying the patterns per line.
     numbered_re = re.compile(r"^\s*\d+(\.\d+)*\s+[A-Za-z]")
     header_re = re.compile(
         r"\b(table\s*of\s*contents?|contents?|index|overview)\b", re.IGNORECASE
     )
     end_re = re.compile(r"^\s*(Step\s*\d+|[A-Z][a-z]{2,}\s[A-Z])")
     # Optional trailing leader accepts both "....13" and spaced ". . . . 13".
     entry_re = re.compile(
         r"^\s*(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z0-9\s/&(),-]+)(?:\s*\.[.\s]*\d+)?$"
     )
     lines = text.split("\n")
     toc_entries = []
     seen = set()
     toc_started = False
     for i, line in enumerate(lines):
         if not toc_started:
             if numbered_re.match(line):
                 # A numbered line is never a header, even if its title contains
                 # a header keyword (e.g. "1 Overview"); otherwise that entry
                 # would be consumed as the header and lost.
                 window = lines[i : i + 5]
                 if sum(1 for cand in window if numbered_re.match(cand)) >= 3:
                     toc_started = True
             elif header_re.search(line) and any(
                 numbered_re.match(cand) for cand in lines[i + 1 : i + 8]
             ):
                 toc_started = True
                 continue  # the header line itself carries no entry
         if not toc_started:
             continue
         # First line that looks like body content ends the TOC region.
         if end_re.match(line):
             break
         match = entry_re.match(line.strip())
         if not match:
             continue
         section = match.group(1).strip()
         title = match.group(2).strip()
         if len(title) <= 3:
             continue  # too short to be a meaningful section title
         key = (section, title.lower())
         if key not in seen:
             seen.add(key)
             toc_entries.append((section, title))
     return toc_entries
 # ==========================================================