Shubham170793 committed on
Commit 2f812ee · verified · 1 Parent(s): e11a9ad

Update src/ingestion.py

Files changed (1)
  1. src/ingestion.py +11 -66
src/ingestion.py CHANGED
@@ -1,35 +1,24 @@
 import re
 import fitz  # PyMuPDF
 import unicodedata
-from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
+from langchain_openai import ChatOpenAI  # ✅ FIXED: use native OpenAI for Hugging Face
 
 # ==========================================================
 # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
 # ==========================================================
 def extract_text_from_pdf(file_path: str):
-    """
-    Extracts and cleans text from a PDF using PyMuPDF.
-    Handles layout artifacts, numbered sections, and TOC.
-    Returns clean text + TOC list + source label.
-    """
     text = ""
     try:
         with fitz.open(file_path) as pdf:
             for page_num, page in enumerate(pdf, start=1):
                 page_text = page.get_text("text").strip()
-
-                # Fallback: for scanned or weird layouts
                 if not page_text:
                     blocks = page.get_text("blocks")
                     page_text = " ".join(
                         block[4] for block in blocks if isinstance(block[4], str)
                     )
-
-                # Ensure bullets & numbered sections start on new lines
                 page_text = page_text.replace("• ", "\n• ")
                 page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
-
-                # Remove headers/footers and confidential tags
                 page_text = re.sub(
                     r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE
                 )
@@ -39,16 +28,12 @@ def extract_text_from_pdf(file_path: str):
                     page_text,
                     flags=re.IGNORECASE,
                 )
-
                 text += page_text + "\n"
 
     except Exception as e:
         raise RuntimeError(f"❌ PDF extraction failed: {e}")
 
-    # --- Cleaning pipeline ---
     text = clean_text(text)
-
-    # --- TOC extraction (Hybrid) ---
     toc, toc_source = get_hybrid_toc(text)
     print(f"📘 TOC Source: {toc_source} | Entries: {len(toc)}")
 
@@ -56,35 +41,21 @@ def extract_text_from_pdf(file_path: str):
 
 
 # ==========================================================
-# 2️⃣ ADVANCED CLEANING PIPELINE
+# 2️⃣ CLEANING PIPELINE
 # ==========================================================
 def clean_text(text: str) -> str:
-    """Cleans noisy PDF text before chunking and embedding."""
     text = unicodedata.normalize("NFKD", text)
-
-    # Remove TOC noise (like "6.3.1 Prerequisites .............. 53")
-    text = re.sub(
-        r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text
-    )
-
-    # Replace bullet symbols and dots with consistent spacing
+    text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
     text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
-
-    # Remove excessive dots, hyphens, headers
     text = re.sub(r"\.{3,}", ". ", text)
     text = re.sub(r"-\s*\n", "", text)
     text = re.sub(r"\n\s*(PUBLIC|PRIVATE|Confidential)\s*\n", "\n", text, flags=re.IGNORECASE)
     text = re.sub(r"©\s*[A-Z].*?\d{4}", "", text)
-
-    # Normalize newlines and spaces
     text = text.replace("\r", " ")
     text = re.sub(r"\n{2,}", "\n", text)
     text = re.sub(r"\s{2,}", " ", text)
-
-    # Clean leftover special chars
     text = re.sub(r"[^A-Za-z0-9,;:.\-\(\)/&\n\s]", "", text)
     text = re.sub(r"(\s*\.\s*){3,}", " ", text)
-
     return text.strip()
 
 
@@ -92,12 +63,6 @@ def clean_text(text: str) -> str:
 # 3️⃣ TABLE OF CONTENTS DETECTION (Heuristic)
 # ==========================================================
 def extract_table_of_contents(text: str):
-    """
-    Smart TOC detector for enterprise PDFs.
-    Handles 'Table of Contents', 'Contents', 'Content', 'Index', 'Overview',
-    and implicit numbered TOCs without a header.
-    Returns list of (section_number, section_title).
-    """
     toc_entries = []
     lines = text.split("\n")
     toc_started = False
@@ -105,28 +70,24 @@ def extract_table_of_contents(text: str):
     line_count = len(lines)
 
     for i, line in enumerate(lines):
-        # --- Step 1️⃣: Detect TOC header variants ---
         if not toc_started and re.search(r"\b(table\s*of\s*contents?|contents?|index|overview)\b", line, re.IGNORECASE):
             next_lines = lines[i + 1 : i + 8]
             if any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in next_lines):
                 toc_started = True
             continue
 
-        # --- Step 2️⃣: Smart fallback — detect implicit TOC ---
         if not toc_started and re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", line):
             numbered_lines = 0
             for j in range(i, min(i + 5, line_count)):
                 if re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", lines[j]):
                     numbered_lines += 1
-            if numbered_lines >= 3:  # heuristic to confirm pattern
+            if numbered_lines >= 3:
                 toc_started = True
 
-        # --- Step 3️⃣: Detect end of TOC region ---
         if toc_started and re.match(r"^\s*(Step\s*\d+|[A-Z][a-z]{2,}\s[A-Z])", line):
             toc_ended = True
             break
 
-        # --- Step 4️⃣: Extract TOC entries ---
         if toc_started and not toc_ended:
             match = re.match(
                 r"^\s*(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z0-9\s/&(),-]+)(?:\.+\s*\d+)?$",
@@ -138,7 +99,6 @@ def extract_table_of_contents(text: str):
             if len(title) > 3 and not re.match(r"^\d+$", title):
                 toc_entries.append((section, title))
 
-    # --- Step 5️⃣: Clean up duplicates ---
     deduped = []
     seen = set()
     for sec, title in toc_entries:
@@ -146,7 +106,6 @@ def extract_table_of_contents(text: str):
         if key not in seen:
             deduped.append((sec, title))
             seen.add(key)
-
     return deduped
 
 
@@ -155,11 +114,12 @@ def extract_table_of_contents(text: str):
 # ==========================================================
 def adaptive_fallback_toc(text: str, model: str = "gpt-4o-mini", max_chars: int = 7000):
     """
-    Uses an LLM to infer a Table of Contents from the document text.
-    Called only when no TOC is found via regex parsing.
+    Uses an OpenAI LLM to infer TOC from document text.
+    Works seamlessly on Hugging Face.
     """
     snippet = text[:max_chars]
-    llm = ChatOpenAI(model=model, temperature=0)
+    llm = ChatOpenAI(model=model, temperature=0)  # ✅ FIXED CONNECTOR
+
     prompt = f"""
     You are a document structure analyzer.
     Read the following text and infer its main section titles.
@@ -186,17 +146,12 @@ def adaptive_fallback_toc(text: str, model: str = "gpt-4o-mini", max_chars: int
 # 3B️⃣ UNIFIED WRAPPER (Heuristic + AI Hybrid)
 # ==========================================================
 def get_hybrid_toc(text: str):
-    """
-    Attempts heuristic TOC extraction; if none found,
-    triggers adaptive AI fallback.
-    Returns (toc_entries, source_label).
-    """
     toc_entries = extract_table_of_contents(text)
     if toc_entries:
         print(f"📘 TOC detected with {len(toc_entries)} entries (heuristic).")
         return toc_entries, "heuristic"
 
-    print("⚠️ No TOC detected — invoking adaptive AI fallback...")
+    print("⚠️ No TOC detected — invoking AI fallback...")
     toc_ai = adaptive_fallback_toc(text)
     if toc_ai:
         print(f"✨ AI-inferred TOC generated with {len(toc_ai)} entries.")
@@ -207,13 +162,9 @@ def get_hybrid_toc(text: str):
 
 
 # ==========================================================
-# 4️⃣ SMART CHUNKING (Auto-Sized + Continuity-Aware)
+# 4️⃣ CHUNKING + HELPERS (unchanged)
 # ==========================================================
 def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
-    """
-    Enhanced chunking for structured enterprise PDFs.
-    Auto-selects chunk size and keeps procedural context intact.
-    """
     text_length = len(text)
     if chunk_size is None:
         if text_length > 200000:
@@ -253,7 +204,6 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
 
     chunks = _merge_small_chunks(chunks, min_len=200)
 
-    # Add continuity overlap
     final_chunks = []
     for i, ch in enumerate(chunks):
         if i == 0:
@@ -266,9 +216,6 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
     return final_chunks
 
 
-# ==========================================================
-# 5️⃣ Helper Functions
-# ==========================================================
 def _split_by_sentence(text, chunk_size=800, overlap=80):
     sentences = re.split(r"(?<=[.!?])\s+", text)
     chunks, current = [], ""
@@ -301,7 +248,7 @@ def _merge_small_chunks(chunks, min_len=150):
 
 
 # ==========================================================
-# 6️⃣ DEBUGGING (Manual Run)
+# 5️⃣ DEBUGGING
 # ==========================================================
 if __name__ == "__main__":
     pdf_path = "sample.pdf"
@@ -309,5 +256,3 @@ if __name__ == "__main__":
     print("\n📚 TOC Preview:", toc[:5])
     chunks = chunk_text(text)
     print(f"\n✅ {len(chunks)} chunks created.")
-    for i, c in enumerate(chunks[:5], 1):
-        print(f"\n--- Chunk {i} ---\n{c[:500]}...\n")
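Note on the key change: the SAP gen_ai_hub proxy connector is replaced with the native langchain_openai connector so the TOC fallback can run outside SAP AI Core, e.g. on Hugging Face. A minimal sketch of how the new import is exercised, assuming an OPENAI_API_KEY environment variable is available to the process (the model name mirrors the default in adaptive_fallback_toc; the prompt text here is illustrative only):

    from langchain_openai import ChatOpenAI

    # Assumes OPENAI_API_KEY is set in the environment; no SAP proxy config is needed.
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)  # temperature=0 keeps TOC inference deterministic
    reply = llm.invoke("Infer the main section titles:\n1 Introduction\n2 Setup\n2.1 Prerequisites")
    print(reply.content)  # plain-text answer from the chat model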