Shubham170793 committed on
Commit
b61a150
·
verified ·
1 Parent(s): 32f64de

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +44 -38
src/ingestion.py CHANGED
@@ -9,11 +9,6 @@ def extract_text_from_pdf(file_path: str) -> str:
9
  """
10
  Extracts and cleans text from a PDF using PyMuPDF.
11
  Handles noisy layout artifacts, page numbers, and TOC dots.
12
-
13
- Args:
14
- file_path (str): Path to the PDF file.
15
- Returns:
16
- str: Cleaned, normalized text.
17
  """
18
  text = ""
19
  try:
@@ -24,15 +19,24 @@ def extract_text_from_pdf(file_path: str) -> str:
24
  # Fallback: handle scanned or weirdly structured pages
25
  if not page_text:
26
  blocks = page.get_text("blocks")
27
- page_text = " ".join(block[4] for block in blocks if isinstance(block[4], str))
 
 
28
 
29
  # 🔹 NEW: ensure bullets & numbered sections start on new lines
30
  page_text = page_text.replace("• ", "\n• ")
31
  page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
32
 
33
  # Remove repeating headers/footers (e.g., “PUBLIC”, “Page 5 of 110”)
34
- page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
35
- page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
 
 
 
 
 
 
 
36
 
37
  text += page_text + "\n"
38
 
@@ -48,16 +52,13 @@ def extract_text_from_pdf(file_path: str) -> str:
48
  # 2️⃣ ADVANCED CLEANING PIPELINE (SAP / Enterprise PDFs)
49
  # ==========================================================
50
  def clean_text(text: str) -> str:
51
- """
52
- Cleans noisy extracted PDF text before chunking and embedding.
53
- Handles TOC artifacts, broken lines, bullets, and special characters.
54
- """
55
-
56
- # Normalize Unicode (e.g., weird quotes, ligatures)
57
  text = unicodedata.normalize("NFKD", text)
58
 
59
  # Remove TOC or numbering noise (e.g., “6.3.1 Prerequisites .............. 53”)
60
- text = re.sub(r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text)
 
 
61
 
62
  # Replace bullet symbols and dots with consistent spacing
63
  text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
@@ -67,7 +68,9 @@ def clean_text(text: str) -> str:
67
  text = re.sub(r"-\s*\n", "", text)
68
 
69
  # Remove page headers/footers (common in SAP docs)
70
- text = re.sub(r"\n\s*(PUBLIC|PRIVATE|Confidential)\s*\n", "\n", text, flags=re.IGNORECASE)
 
 
71
  text = re.sub(r"©\s*[A-Z].*?\d{4}", "", text)
72
 
73
  # Normalize newlines → paragraph breaks
@@ -81,60 +84,63 @@ def clean_text(text: str) -> str:
81
  # Remove multiple section dots from TOC lines
82
  text = re.sub(r"(\s*\.\s*){3,}", " ", text)
83
 
84
- # Trim and normalize spacing
85
- text = text.strip()
86
-
87
- return text
88
 
89
 
90
  # ==========================================================
91
- # 3️⃣ SMART CHUNKING (Step-Aware + Sentence Backup)
92
  # ==========================================================
93
- def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
94
  """
95
  Enhanced chunking for structured enterprise PDFs (SAP guides).
 
96
  ✅ Keeps bullet lists, numbered steps, and headings together.
97
- ✅ Avoids breaking chunks mid-list or mid-section.
98
  """
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  # Normalize whitespace
101
  text = re.sub(r"\s+", " ", text.strip())
102
 
103
- # --- Step 1️⃣: Split into logical sections by headings or step titles ---
104
- # Detect section headers like "3.1.2 Prerequisites for Commerce Automation", "Step 2:", etc.
105
- section_pattern = r"(?=(?:\n?\d+(?:\.\d+){0,3}\s+[A-Z][^\n]{3,100})|(?:Step\s*\d+[:.\s]))"
 
106
  sections = re.split(section_pattern, text)
107
  sections = [s.strip() for s in sections if s and isinstance(s, str) and s.strip()]
108
 
109
  chunks = []
110
-
111
  for section in sections:
112
- # --- Step 2️⃣: Merge multi-line bullets ---
113
- # e.g., "- Ensure that..." or "• Activate the feature..."
114
  section = re.sub(r"\n\s*[-•▪‣]\s*", " • ", section)
115
  bullets = re.split(r"(?=\s*[-•▪‣]\s)", section)
116
  bullets = [b.strip() for b in bullets if b.strip()]
117
 
118
- # Case A: Multiple bullets (keep as one coherent block)
119
  if len(bullets) > 2:
120
  combined = " ".join(bullets)
121
-
122
- # If the bullet section is very long, split every few bullets
123
  if len(combined) > chunk_size * 1.5:
124
  for i in range(0, len(bullets), 6):
125
  block = " ".join(bullets[i:i+6])
126
  chunks.append(block.strip())
127
  else:
128
  chunks.append(combined.strip())
129
-
130
- # Case B: Single bullet or normal paragraph → split by sentence
131
  else:
132
  chunks.extend(_split_by_sentence(section, chunk_size, overlap))
133
 
134
- # --- Step 3️⃣: Merge small fragments to keep continuity ---
135
  chunks = _merge_small_chunks(chunks, min_len=200)
136
 
137
- # --- Step 4️⃣: Ensure overlap continuity between neighboring chunks ---
138
  final_chunks = []
139
  for i, ch in enumerate(chunks):
140
  if i == 0:
@@ -152,7 +158,7 @@ def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
152
  # ==========================================================
153
  def _split_by_sentence(text, chunk_size=800, overlap=80):
154
  """Split by sentence punctuation to preserve semantics."""
155
- sentences = re.split(r'(?<=[.!?])\s+', text)
156
  chunks, current = [], ""
157
  for sent in sentences:
158
  if len(current) + len(sent) + 1 <= chunk_size:
@@ -189,6 +195,6 @@ def _merge_small_chunks(chunks, min_len=150):
189
  if __name__ == "__main__":
190
  pdf_path = "sample.pdf"
191
  text = extract_text_from_pdf(pdf_path)
192
- chunks = chunk_text(text, chunk_size=600, overlap=100)
193
  for i, c in enumerate(chunks[:5], 1):
194
  print(f"\n--- Chunk {i} ---\n{c[:500]}...\n")
 
9
  """
10
  Extracts and cleans text from a PDF using PyMuPDF.
11
  Handles noisy layout artifacts, page numbers, and TOC dots.
 
 
 
 
 
12
  """
13
  text = ""
14
  try:
 
19
  # Fallback: handle scanned or weirdly structured pages
20
  if not page_text:
21
  blocks = page.get_text("blocks")
22
+ page_text = " ".join(
23
+ block[4] for block in blocks if isinstance(block[4], str)
24
+ )
25
 
26
  # 🔹 NEW: ensure bullets & numbered sections start on new lines
27
  page_text = page_text.replace("• ", "\n• ")
28
  page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
29
 
30
  # Remove repeating headers/footers (e.g., “PUBLIC”, “Page 5 of 110”)
31
+ page_text = re.sub(
32
+ r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE
33
+ )
34
+ page_text = re.sub(
35
+ r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})",
36
+ "",
37
+ page_text,
38
+ flags=re.IGNORECASE,
39
+ )
40
 
41
  text += page_text + "\n"
42
 
 
52
  # 2️⃣ ADVANCED CLEANING PIPELINE (SAP / Enterprise PDFs)
53
  # ==========================================================
54
  def clean_text(text: str) -> str:
55
+ """Cleans noisy extracted PDF text before chunking and embedding."""
 
 
 
 
 
56
  text = unicodedata.normalize("NFKD", text)
57
 
58
  # Remove TOC or numbering noise (e.g., “6.3.1 Prerequisites .............. 53”)
59
+ text = re.sub(
60
+ r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text
61
+ )
62
 
63
  # Replace bullet symbols and dots with consistent spacing
64
  text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
 
68
  text = re.sub(r"-\s*\n", "", text)
69
 
70
  # Remove page headers/footers (common in SAP docs)
71
+ text = re.sub(
72
+ r"\n\s*(PUBLIC|PRIVATE|Confidential)\s*\n", "\n", text, flags=re.IGNORECASE
73
+ )
74
  text = re.sub(r"©\s*[A-Z].*?\d{4}", "", text)
75
 
76
  # Normalize newlines → paragraph breaks
 
84
  # Remove multiple section dots from TOC lines
85
  text = re.sub(r"(\s*\.\s*){3,}", " ", text)
86
 
87
+ return text.strip()
 
 
 
88
 
89
 
90
  # ==========================================================
91
+ # 3️⃣ SMART CHUNKING (Step-Aware + Auto-Sized)
92
  # ==========================================================
93
+ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
94
  """
95
  Enhanced chunking for structured enterprise PDFs (SAP guides).
96
+ ✅ Auto-selects chunk size based on document length.
97
  ✅ Keeps bullet lists, numbered steps, and headings together.
 
98
  """
99
 
100
+ # --- Auto-tune chunk size based on document length ---
101
+ text_length = len(text)
102
+ if chunk_size is None:
103
+ if text_length > 200000:
104
+ chunk_size, overlap = 2000, 250
105
+ elif text_length > 50000:
106
+ chunk_size, overlap = 1500, 200
107
+ else:
108
+ chunk_size, overlap = 1000, 150
109
+ elif overlap is None:
110
+ overlap = 150
111
+
112
+ print(f"⚙️ Auto-selected chunk_size={chunk_size}, overlap={overlap} (len={text_length})")
113
+
114
  # Normalize whitespace
115
  text = re.sub(r"\s+", " ", text.strip())
116
 
117
+ # --- Step 1️⃣: Split into logical sections ---
118
+ section_pattern = (
119
+ r"(?=(?:\n?\d+(?:\.\d+){0,3}\s+[A-Z][^\n]{3,100})|(?:Step\s*\d+[:.\s]))"
120
+ )
121
  sections = re.split(section_pattern, text)
122
  sections = [s.strip() for s in sections if s and isinstance(s, str) and s.strip()]
123
 
124
  chunks = []
 
125
  for section in sections:
 
 
126
  section = re.sub(r"\n\s*[-•▪‣]\s*", " • ", section)
127
  bullets = re.split(r"(?=\s*[-•▪‣]\s)", section)
128
  bullets = [b.strip() for b in bullets if b.strip()]
129
 
 
130
  if len(bullets) > 2:
131
  combined = " ".join(bullets)
 
 
132
  if len(combined) > chunk_size * 1.5:
133
  for i in range(0, len(bullets), 6):
134
  block = " ".join(bullets[i:i+6])
135
  chunks.append(block.strip())
136
  else:
137
  chunks.append(combined.strip())
 
 
138
  else:
139
  chunks.extend(_split_by_sentence(section, chunk_size, overlap))
140
 
 
141
  chunks = _merge_small_chunks(chunks, min_len=200)
142
 
143
+ # --- Ensure overlap continuity ---
144
  final_chunks = []
145
  for i, ch in enumerate(chunks):
146
  if i == 0:
 
158
  # ==========================================================
159
  def _split_by_sentence(text, chunk_size=800, overlap=80):
160
  """Split by sentence punctuation to preserve semantics."""
161
+ sentences = re.split(r"(?<=[.!?])\s+", text)
162
  chunks, current = [], ""
163
  for sent in sentences:
164
  if len(current) + len(sent) + 1 <= chunk_size:
 
195
  if __name__ == "__main__":
196
  pdf_path = "sample.pdf"
197
  text = extract_text_from_pdf(pdf_path)
198
+ chunks = chunk_text(text)
199
  for i, c in enumerate(chunks[:5], 1):
200
  print(f"\n--- Chunk {i} ---\n{c[:500]}...\n")