Shubham170793 committed on
Commit
35646e4
·
verified ·
1 Parent(s): 6d7ba5b

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +60 -21
src/ingestion.py CHANGED
@@ -1,43 +1,82 @@
 
1
  import fitz # PyMuPDF
2
- from langchain.text_splitter import RecursiveCharacterTextSplitter
3
-
4
 
 
 
 
5
def extract_text_from_pdf(file_path: str) -> str:
    """
    Read every page of a PDF with PyMuPDF and return the combined text.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: Text of all pages concatenated in page order (empty string
            when the document has no pages).
    """
    # The context manager closes the document even if extraction fails.
    with fitz.open(file_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)
23
 
24
 
25
-
26
-
27
def chunk_text(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> list:
    """
    Break extracted text into smaller, overlapping chunks.

    The work is delegated to LangChain's RecursiveCharacterTextSplitter.

    Args:
        text (str): The full extracted text.
        chunk_size (int): Max characters per chunk.
        chunk_overlap (int): Overlap between chunks.

    Returns:
        list: List of text chunks.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)
 
1
+ import re
2
  import fitz # PyMuPDF
 
 
3
 
4
+ # -----------------------------
5
+ # TEXT EXTRACTION
6
+ # -----------------------------
7
def extract_text_from_pdf(file_path: str) -> str:
    """
    Pull the plain text out of a PDF file using PyMuPDF.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: The text of every page, concatenated in page order.
    """
    # The document is closed automatically when the `with` block exits,
    # including when the return statement below leaves it.
    with fitz.open(file_path) as document:
        return "".join(page.get_text("text") for page in document)
22
 
23
 
24
+ # -----------------------------
25
+ # SMART CHUNKING (sentence-aware)
26
+ # -----------------------------
27
def chunk_text(text: str, chunk_size: int = 800, overlap: int = 150) -> list:
    """
    Splits extracted text into overlapping, sentence-based chunks.

    Whitespace is normalized to single spaces, the text is split into
    sentences after '.', '!' or '?', and sentences are packed greedily into
    chunks. No returned chunk is longer than ``chunk_size`` characters:
    sentences that are too long on their own are split on word boundaries
    (and single words longer than the limit are hard-sliced).

    Args:
        text (str): Extracted document text.
        chunk_size (int): Max characters per chunk (default: 800).
        overlap (int): Max characters of trailing context carried over from
            the previous chunk (default: 150). The carried text always
            starts on a word boundary, and it is dropped when adding it
            would push the new chunk past ``chunk_size``. Values <= 0
            disable overlap.

    Returns:
        list[str]: List of text chunks (empty for blank input).

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Step 1. Clean and normalize whitespace
    text = re.sub(r'\s+', ' ', text.strip())
    if not text:
        return []

    # Step 2. Split into sentences, breaking up any that exceed the limit
    pieces = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        pieces.extend(_split_oversized(sentence, chunk_size))

    # Step 3. Pack pieces into chunks without ever exceeding chunk_size
    chunks = []
    current = ""
    for piece in pieces:
        if not current:
            current = piece
        elif len(current) + 1 + len(piece) <= chunk_size:
            current = f"{current} {piece}"
        else:
            chunks.append(current)
            # Carry trailing context forward only when it still fits.
            tail = _overlap_tail(current, overlap)
            if tail and len(tail) + 1 + len(piece) <= chunk_size:
                current = f"{tail} {piece}"
            else:
                current = piece

    # Step 4. Add final chunk
    if current:
        chunks.append(current)

    return chunks


def _split_oversized(sentence: str, chunk_size: int) -> list:
    """Break a sentence longer than chunk_size into word-aligned parts."""
    if len(sentence) <= chunk_size:
        return [sentence] if sentence else []

    parts = []
    current = ""
    for word in sentence.split(" "):
        # A single token longer than the limit has to be hard-sliced.
        while len(word) > chunk_size:
            if current:
                parts.append(current)
                current = ""
            parts.append(word[:chunk_size])
            word = word[chunk_size:]
        if not word:
            continue
        if not current:
            current = word
        elif len(current) + 1 + len(word) <= chunk_size:
            current = f"{current} {word}"
        else:
            parts.append(current)
            current = word
    if current:
        parts.append(current)
    return parts


def _overlap_tail(chunk: str, overlap: int) -> str:
    """Return up to `overlap` trailing characters of chunk, starting on a word boundary."""
    if overlap <= 0:
        return ""
    if overlap >= len(chunk):
        return chunk
    tail = chunk[-overlap:]
    if chunk[-overlap - 1] != " ":
        # The slice begins mid-word (or on a separator): drop the fragment
        # so the carried context starts with a whole word.
        _, separator, remainder = tail.partition(" ")
        tail = remainder if separator else ""
    return tail.strip()
67
+
68
+
69
+ # -----------------------------
70
+ # OPTIONAL DEBUG / SANITY CHECK
71
+ # -----------------------------
72
if __name__ == "__main__":
    # Local smoke test: chunk a short sample and print each chunk with its size.
    sample_text = """
    Artificial Intelligence is transforming industries.
    Machine learning is a key subfield, driving automation and predictive analytics.
    Neural networks power most modern AI applications today.
    """
    demo_chunks = chunk_text(sample_text, chunk_size=80, overlap=20)
    print("Chunks created:", len(demo_chunks))
    for position, piece in enumerate(demo_chunks, start=1):
        print(f"\n--- Chunk {position} ({len(piece)} chars) ---\n{piece}")