Spaces:

Souvik3333
/

Nanonets-ocr-s

Runtime error

App Files Community

Souvik3333 committed on Jun 13, 2025

Commit

6698955

verified ·

1 Parent(s): 3c1b4ef

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -32

app.py CHANGED Viewed

@@ -1,8 +1,9 @@
 import gradio as gr
 from PIL import Image
-from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
 import torch
 import spaces
 model_path = "nanonets/Nanonets-OCR-s"
@@ -33,6 +34,65 @@ def process_tags(content: str) -> str:
     return content
 @spaces.GPU()
 def ocr_image_gradio(image, max_tokens=4096):
     """Process image through Nanonets OCR model for Gradio interface"""
@@ -88,6 +148,9 @@ with gr.Blocks(title="Nanonets OCR Demo") as demo:
                 💻 GitHub Repository
             </a>
         </div>
     </div>
     """)
@@ -108,9 +171,16 @@ with gr.Blocks(title="Nanonets OCR Demo") as demo:
             )
             extract_btn = gr.Button("Extract Text", variant="primary", size="lg")
         with gr.Column(scale=2):
             output_text = gr.Markdown(
-                label="Formatted model prediction",
                 latex_delimiters=[
                     {"left": "$$", "right": "$$", "display": True},
                     {"left": "$", "right": "$", "display": False},
@@ -124,16 +194,16 @@ with gr.Blocks(title="Nanonets OCR Demo") as demo:
                 show_copy_button=True,
             )
-    # Event handlers
     extract_btn.click(
-        fn=ocr_image_gradio,
         inputs=[image_input, max_tokens_slider],
         outputs=output_text,
         show_progress=True
     )
     image_input.change(
-        fn=ocr_image_gradio,
         inputs=[image_input, max_tokens_slider],
         outputs=output_text,
         show_progress=True
@@ -142,32 +212,42 @@ with gr.Blocks(title="Nanonets OCR Demo") as demo:
     # Add model information section
     with gr.Accordion("About Nanonets-OCR-s", open=False):
         gr.Markdown("""
-        ## Nanonets-OCR-s
-        Nanonets-OCR-s is a powerful, state-of-the-art image-to-markdown OCR model that goes far beyond traditional text extraction.
-        It transforms documents into structured markdown with intelligent content recognition and semantic tagging, making it ideal
-        for downstream processing by Large Language Models (LLMs).
-        ### Key Features
-        - **LaTeX Equation Recognition**: Automatically converts mathematical equations and formulas into properly formatted LaTeX syntax.
-          It distinguishes between inline ($...$) and display ($$...$$) equations.
-        - **Intelligent Image Description**: Describes images within documents using structured `<img>` tags, making them digestible
-          for LLM processing. It can describe various image types, including logos, charts, graphs and so on, detailing their content,
-          style, and context.
-        - **Signature Detection & Isolation**: Identifies and isolates signatures from other text, outputting them within a `<signature>` tag.
-          This is crucial for processing legal and business documents.
-        - **Watermark Extraction**: Detects and extracts watermark text from documents, placing it within a `<watermark>` tag.
-        - **Smart Checkbox Handling**: Converts form checkboxes and radio buttons into standardized Unicode symbols (☐, ☑, ☒)
-          for consistent and reliable processing.
-        - **Complex Table Extraction**: Accurately extracts complex tables from documents and converts them into both markdown
-          and HTML table formats.
-        """)
 if __name__ == "__main__":
-    demo.queue().launch()

 import gradio as gr
 from PIL import Image
+from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
 import torch
 import spaces
+import threading
 model_path = "nanonets/Nanonets-OCR-s"
     return content
+@spaces.GPU()
+def ocr_image_gradio_stream(image, max_tokens=4096):
+    """Process image through Nanonets OCR model with streaming output"""
+    if image is None:
+        yield "Please upload an image."
+        return
+    try:
+        prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
+        # Convert PIL image if needed
+        if not isinstance(image, Image.Image):
+            image = Image.fromarray(image)
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": prompt},
+            ]},
+        ]
+        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
+        inputs = inputs.to(model.device)
+        # Set up streaming
+        streamer = TextIteratorStreamer(
+            tokenizer=tokenizer,
+            skip_prompt=True,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=True
+        )
+        generation_kwargs = {
+            **inputs,
+            "max_new_tokens": max_tokens,
+            "do_sample": False,
+            "streamer": streamer,
+        }
+        # Start generation in a separate thread
+        generation_thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
+        generation_thread.start()
+        # Stream the output
+        partial_output = ""
+        for new_token in streamer:
+            partial_output += new_token
+            processed_output = process_tags(partial_output)
+            yield processed_output
+        # Ensure thread completes
+        generation_thread.join()
+    except Exception as e:
+        yield f"Error processing image: {str(e)}"
+# Non-streaming version as fallback
 @spaces.GPU()
 def ocr_image_gradio(image, max_tokens=4096):
     """Process image through Nanonets OCR model for Gradio interface"""
                 💻 GitHub Repository
             </a>
         </div>
+        <p style="font-size: 0.9em; color: #10b981; font-weight: 500;">
+            ✨ Now with streaming output and support for 4 concurrent uploads!
+        </p>
     </div>
     """)
             )
             extract_btn = gr.Button("Extract Text", variant="primary", size="lg")
+            gr.Markdown("""
+            **💡 Tips:**
+            - Upload supports concurrent processing of up to 4 images
+            - Results stream in real-time as they're generated
+            - Automatic processing starts when you upload an image
+            """)
         with gr.Column(scale=2):
             output_text = gr.Markdown(
+                label="Streaming model prediction",
                 latex_delimiters=[
                     {"left": "$$", "right": "$$", "display": True},
                     {"left": "$", "right": "$", "display": False},
                 show_copy_button=True,
             )
+    # Event handlers with streaming
     extract_btn.click(
+        fn=ocr_image_gradio_stream,
         inputs=[image_input, max_tokens_slider],
         outputs=output_text,
         show_progress=True
     )
     image_input.change(
+        fn=ocr_image_gradio_stream,
         inputs=[image_input, max_tokens_slider],
         outputs=output_text,
         show_progress=True
     # Add model information section
     with gr.Accordion("About Nanonets-OCR-s", open=False):
         gr.Markdown("""
+## Nanonets-OCR-s
+Nanonets-OCR-s is a powerful, state-of-the-art image-to-markdown OCR model that goes far beyond traditional text extraction.
+It transforms documents into structured markdown with intelligent content recognition and semantic tagging, making it ideal
+for downstream processing by Large Language Models (LLMs).
+### Key Features
+- **LaTeX Equation Recognition**: Automatically converts mathematical equations and formulas into properly formatted LaTeX syntax.
+    It distinguishes between inline `($...$)` and display `($$...$$)` equations.
+- **Intelligent Image Description**: Describes images within documents using structured `<img>` tags, making them digestible
+    for LLM processing. It can describe various image types, including logos, charts, graphs and so on, detailing their content,
+    style, and context.
+- **Signature Detection & Isolation**: Identifies and isolates signatures from other text, outputting them within a `<signature>` tag.
+    This is crucial for processing legal and business documents.
+- **Watermark Extraction**: Detects and extracts watermark text from documents, placing it within a `<watermark>` tag.
+- **Smart Checkbox Handling**: Converts form checkboxes and radio buttons into standardized Unicode symbols (☐, ☑, ☒)
+    for consistent and reliable processing.
+- **Complex Table Extraction**: Accurately extracts complex tables from documents and converts them into both markdown
+    and HTML table formats.
+""")
 if __name__ == "__main__":
+    # Configure for concurrent processing with streaming support
+    demo.queue(
+        max_size=20,  # Maximum queue size
+        concurrency_count=4,  # Allow 4 concurrent requests
+        status_update_rate=0.1,  # Update status every 100ms for better streaming experience
+    ).launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True,
+        share=False
+    )