Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

Shubham170793 committed on Oct 4

Commit

35646e4

verified ·

1 Parent(s): 6d7ba5b

Update src/ingestion.py

Browse files

Files changed (1) hide show

src/ingestion.py +60 -21

src/ingestion.py CHANGED Viewed

@@ -1,43 +1,82 @@
 import fitz  # PyMuPDF
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 def extract_text_from_pdf(file_path: str) -> str:
     """
     Extracts text from a PDF file using PyMuPDF.
     Args:
         file_path (str): Path to the PDF file.
     Returns:
         str: The extracted text from the PDF.
     """
     text = ""
-    # Open the PDF
     with fitz.open(file_path) as pdf:
-        # Loop through each page
         for page in pdf:
-            # Extract text from that page
-            text += page.get_text()
     return text
-def chunk_text(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> list:
     """
-    Splits extracted text into smaller overlapping chunks.
     Args:
-        text (str): The full extracted text.
-        chunk_size (int): Max characters per chunk.
-        chunk_overlap (int): Overlap between chunks.
     Returns:
-        list: List of text chunks.
     """
-    splitter = RecursiveCharacterTextSplitter(
-        chunk_size=chunk_size,
-        chunk_overlap=chunk_overlap
-    )
-    return splitter.split_text(text)

+import re
 import fitz  # PyMuPDF
# -----------------------------
# TEXT EXTRACTION
# -----------------------------
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extracts text from a PDF file using PyMuPDF.

    Args:
        file_path (str): Path to the PDF file.
    Returns:
        str: The text of every page, concatenated in page order with no
            extra separator between pages ("" for a document with no pages).
    """
    with fitz.open(file_path) as pdf:
        # Join once instead of `text += ...` per page: repeated string
        # concatenation can degrade to quadratic time on large documents.
        return "".join(page.get_text("text") for page in pdf)
+# -----------------------------
+# SMART CHUNKING (sentence-aware)
+# -----------------------------
def _overlap_tail(chunk: str, limit: int) -> str:
    """
    Returns at most `limit` trailing characters of `chunk`, whole words only.

    Args:
        chunk (str): The whitespace-normalized chunk that was just completed.
        limit (int): Maximum number of characters to carry over.
    Returns:
        str: Trailing words of `chunk`, or "" when nothing fits.
    """
    if limit <= 0 or not chunk:
        return ""
    if limit >= len(chunk):
        return chunk
    tail = chunk[-limit:]
    # A raw character slice can start in the middle of a word. When the
    # character just before the slice is not a space, drop everything up to
    # the first space so the overlap begins on a word boundary.
    if chunk[-limit - 1] != " ":
        tail = tail.partition(" ")[2]
    return tail.strip()


def chunk_text(text: str, chunk_size: int = 800, overlap: int = 150) -> list:
    """
    Splits extracted text into overlapping, sentence-based chunks.

    Whitespace is collapsed to single spaces, the text is split after
    ``.``, ``!`` or ``?`` followed by whitespace, and sentences are packed
    greedily into chunks. Each new chunk starts with the trailing words of
    the previous chunk so context carries across the boundary.

    Args:
        text (str): Extracted document text. Empty or None yields [].
        chunk_size (int): Max characters per chunk (default: 800). The
            overlap is shortened or dropped when it would push a chunk past
            this limit. A single sentence longer than `chunk_size` is kept
            whole as its own oversized chunk rather than cut mid-sentence.
        overlap (int): Max characters carried over from the previous chunk
            (default: 150). Always whole words; values <= 0 disable overlap.
    Returns:
        list[str]: List of text chunks, in document order.
    """
    if not text:
        return []

    # Step 1. Clean and normalize whitespace
    normalized = re.sub(r'\s+', ' ', text.strip())
    if not normalized:
        return []

    # Step 2. Split into sentences (simple punctuation heuristic)
    sentences = re.split(r'(?<=[.!?])\s+', normalized)

    chunks = []
    current = ""

    # Step 3. Build chunks by adding sentences until the limit is reached
    for sentence in sentences:
        if not current:
            current = sentence
            continue
        if len(current) + 1 + len(sentence) <= chunk_size:
            current = f"{current} {sentence}"
            continue

        # The sentence does not fit: close the current chunk.
        chunks.append(current)

        # Seed the next chunk with an overlap, but never let the overlap
        # itself push the new chunk beyond chunk_size.
        room = chunk_size - len(sentence) - 1
        tail = _overlap_tail(current, min(overlap, room))
        current = f"{tail} {sentence}" if tail else sentence

    # Step 4. Add final chunk
    if current:
        chunks.append(current)
    return chunks
+# -----------------------------
+# OPTIONAL DEBUG / SANITY CHECK
+# -----------------------------
if __name__ == "__main__":
    # Local smoke test: chunk a short paragraph with a small size limit
    # and print each resulting chunk together with its length.
    sample_text = """
    Artificial Intelligence is transforming industries.
    Machine learning is a key subfield, driving automation and predictive analytics.
    Neural networks power most modern AI applications today.
    """
    demo_chunks = chunk_text(sample_text, chunk_size=80, overlap=20)
    print("Chunks created:", len(demo_chunks))
    for index, piece in enumerate(demo_chunks, start=1):
        print(f"\n--- Chunk {index} ({len(piece)} chars) ---\n{piece}")