Shubham170793 committed on
Commit
499bde3
·
verified ·
1 Parent(s): abee1e5

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +37 -17
src/ingestion.py CHANGED
@@ -90,46 +90,66 @@ def clean_text(text: str) -> str:
90
  return text.strip()
91
 
92
 
93
- # ==========================================================
94
- # 3️⃣ TABLE OF CONTENTS DETECTION
95
- # ==========================================================
96
  # ==========================================================
97
  # 3️⃣ TABLE OF CONTENTS DETECTION (Improved)
98
  # ==========================================================
99
  def extract_table_of_contents(text: str):
100
  """
101
- Detects Table of Contents (TOC) in PDFs.
102
- Supports variants like 'Contents', 'Index', or 'Overview'.
 
103
  Returns list of (section_number, section_title).
104
  """
105
  toc_entries = []
106
  lines = text.split("\n")
107
  toc_started = False
 
 
108
 
109
  for i, line in enumerate(lines):
110
- # Detect possible TOC header variants
111
- if not toc_started and re.search(r"\b(table\s*of\s*contents|contents|index|overview)\b", line, re.IGNORECASE):
112
- # Confidence check look ahead a few lines
113
- next_lines = lines[i + 1 : i + 6]
114
  if any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in next_lines):
115
  toc_started = True
116
  continue
117
 
118
- if toc_started:
119
- # Stop scanning when main content starts (e.g., "Step 1:" or "1. Introduction")
120
- if re.match(r"^\s*(Step\s*\d+|1\.\s*[A-Z])", line):
121
- break
 
 
 
 
122
 
123
- # Match lines like "3.2 Configure Endpoints ........ 13"
124
- match = re.match(r"^\s*(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z0-9\s/&()-]+)", line)
 
 
 
 
 
 
 
 
 
125
  if match:
126
  section = match.group(1).strip()
127
  title = match.group(2).strip()
128
- if len(title) > 3:
129
  toc_entries.append((section, title))
130
 
131
- return toc_entries
 
 
 
 
 
 
 
132
 
 
133
 
134
 
135
  # ==========================================================
 
90
  return text.strip()
91
 
92
 
 
 
 
93
  # ==========================================================
94
  # 3️⃣ TABLE OF CONTENTS DETECTION (Improved)
95
  # ==========================================================
96
  def extract_table_of_contents(text: str):
97
  """
98
+ Smart TOC detector for enterprise PDFs.
99
+ Handles 'Table of Contents', 'Contents', 'Content', 'Index', 'Overview',
100
+ and implicit numbered TOCs without a header.
101
  Returns list of (section_number, section_title).
102
  """
103
  toc_entries = []
104
  lines = text.split("\n")
105
  toc_started = False
106
+ toc_ended = False
107
+ line_count = len(lines)
108
 
109
  for i, line in enumerate(lines):
110
+ # --- Step 1️⃣: Detect possible TOC header variants ---
111
+ if not toc_started and re.search(r"\b(table\s*of\s*contents?|contents?|index|overview)\b", line, re.IGNORECASE):
112
+ next_lines = lines[i + 1 : i + 8]
 
113
  if any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in next_lines):
114
  toc_started = True
115
  continue
116
 
117
+ # --- Step 2️⃣: Smart fallback — detect implicit TOC without header ---
118
+ if not toc_started and re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", line):
119
+ numbered_lines = 0
120
+ for j in range(i, min(i + 5, line_count)):
121
+ if re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", lines[j]):
122
+ numbered_lines += 1
123
+ if numbered_lines >= 3: # heuristic to confirm pattern
124
+ toc_started = True
125
 
126
+ # --- Step 3️⃣: Detect end of TOC region ---
127
+ if toc_started and re.match(r"^\s*(Step\s*\d+|[A-Z][a-z]{2,}\s[A-Z])", line):
128
+ toc_ended = True
129
+ break
130
+
131
+ # --- Step 4️⃣: Extract TOC entries ---
132
+ if toc_started and not toc_ended:
133
+ match = re.match(
134
+ r"^\s*(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z0-9\s/&(),-]+)(?:\.+\s*\d+)?$",
135
+ line.strip()
136
+ )
137
  if match:
138
  section = match.group(1).strip()
139
  title = match.group(2).strip()
140
+ if len(title) > 3 and not re.match(r"^\d+$", title):
141
  toc_entries.append((section, title))
142
 
143
+ # --- Step 5️⃣: Clean up duplicates ---
144
+ deduped = []
145
+ seen = set()
146
+ for sec, title in toc_entries:
147
+ key = (sec, title.lower())
148
+ if key not in seen:
149
+ deduped.append((sec, title))
150
+ seen.add(key)
151
 
152
+ return deduped
153
 
154
 
155
  # ==========================================================