Update src/ingestion.py
Browse files- src/ingestion.py +37 -17
src/ingestion.py
CHANGED
|
@@ -90,46 +90,66 @@ def clean_text(text: str) -> str:
|
|
| 90 |
return text.strip()
|
| 91 |
|
| 92 |
|
| 93 |
-
# ==========================================================
|
| 94 |
-
# 3️⃣ TABLE OF CONTENTS DETECTION
|
| 95 |
-
# ==========================================================
|
| 96 |
# ==========================================================
|
| 97 |
# 3️⃣ TABLE OF CONTENTS DETECTION (Improved)
|
| 98 |
# ==========================================================
|
| 99 |
def extract_table_of_contents(text: str):
|
| 100 |
"""
|
| 101 |
-
|
| 102 |
-
|
|
|
|
| 103 |
Returns list of (section_number, section_title).
|
| 104 |
"""
|
| 105 |
toc_entries = []
|
| 106 |
lines = text.split("\n")
|
| 107 |
toc_started = False
|
|
|
|
|
|
|
| 108 |
|
| 109 |
for i, line in enumerate(lines):
|
| 110 |
-
# Detect possible TOC header variants
|
| 111 |
-
if not toc_started and re.search(r"\b(table\s*of\s*contents
|
| 112 |
-
|
| 113 |
-
next_lines = lines[i + 1 : i + 6]
|
| 114 |
if any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in next_lines):
|
| 115 |
toc_started = True
|
| 116 |
continue
|
| 117 |
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
if match:
|
| 126 |
section = match.group(1).strip()
|
| 127 |
title = match.group(2).strip()
|
| 128 |
-
if len(title) > 3:
|
| 129 |
toc_entries.append((section, title))
|
| 130 |
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
|
|
|
| 133 |
|
| 134 |
|
| 135 |
# ==========================================================
|
|
|
|
| 90 |
return text.strip()
|
| 91 |
|
| 92 |
|
|
|
|
|
|
|
|
|
|
| 93 |
# ==========================================================
|
| 94 |
# 3️⃣ TABLE OF CONTENTS DETECTION (Improved)
|
| 95 |
# ==========================================================
|
| 96 |
# Keywords that may introduce a table of contents (matched anywhere in a line).
_TOC_HEADER_RE = re.compile(
    r"\b(table\s*of\s*contents?|contents?|index|overview)\b", re.IGNORECASE
)
# A line shaped like a numbered entry, e.g. "1 Intro" or "2.3.1 Scope".
_TOC_NUMBERED_RE = re.compile(r"^\s*\d+(\.\d+)*\s+[A-Za-z]")
# Heuristic for the first line after the TOC: "Step <n>" or two capitalised
# words in a row (e.g. "Chapter One").
_TOC_END_RE = re.compile(r"^\s*(Step\s*\d+|[A-Z][a-z]{2,}\s[A-Z])")
# "<section number> <Title>" optionally followed by dot leaders and a page
# number. The leader part accepts both "....." and ". . . ." styles.
_TOC_ENTRY_RE = re.compile(
    r"^\s*(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z0-9\s/&(),-]+)(?:\.[.\s]*\d+)?$"
)


def extract_table_of_contents(text: str):
    """
    Extract table-of-contents entries from plain text.

    The TOC region starts at either:
      * a line containing 'Table of Contents', 'Contents', 'Content',
        'Index' or 'Overview' (case-insensitive) that is followed, within
        the next 7 lines, by at least one numbered line; or
      * a numbered line that begins a 5-line window holding at least 3
        numbered lines (implicit TOC without a header).

    The region ends at the first line that starts with 'Step <n>' or with
    two capitalised words in a row.

    Args:
        text: Document text, with lines separated by newline characters.

    Returns:
        List of (section_number, section_title) tuples in order of first
        appearance. Titles of 3 characters or fewer are skipped, and
        entries that repeat the same number and case-insensitive title are
        reported once.
    """
    lines = text.split("\n")
    toc_entries = []
    seen = set()
    toc_started = False

    for i, line in enumerate(lines):
        if not toc_started:
            is_numbered = _TOC_NUMBERED_RE.match(line) is not None

            if _TOC_HEADER_RE.search(line) and any(
                _TOC_NUMBERED_RE.match(following)
                for following in lines[i + 1 : i + 8]
            ):
                toc_started = True
                if not is_numbered:
                    # Pure header line: nothing to extract from it.
                    continue
                # Otherwise the "header" is itself an entry such as
                # "1 Overview"; fall through so it is extracted instead
                # of being silently dropped.
            elif is_numbered:
                # Implicit TOC: require a small cluster of numbered lines
                # so a lone numbered sentence does not start the region.
                numbered_in_window = sum(
                    1
                    for candidate in lines[i : i + 5]
                    if _TOC_NUMBERED_RE.match(candidate)
                )
                if numbered_in_window >= 3:
                    toc_started = True

        if not toc_started:
            continue

        if _TOC_END_RE.match(line):
            break

        entry = _TOC_ENTRY_RE.match(line.strip())
        if entry is None:
            continue

        section = entry.group(1).strip()
        title = entry.group(2).strip()
        if len(title) <= 3:
            continue

        # De-duplicate on (number, case-folded title), keeping the first hit.
        key = (section, title.lower())
        if key not in seen:
            seen.add(key)
            toc_entries.append((section, title))

    return toc_entries
|
| 153 |
|
| 154 |
|
| 155 |
# ==========================================================
|