import base64
import json
import logging
import os
import tempfile
from pathlib import Path

import gradio as gr


# Extractor names offered in the UI (annotation buttons and comparison
# dropdowns); each is also the key looked up in a loaded JSON document.
EXTRACTORS = [
    'pdf_plumber',
    'py_pdf',
    'docling',
    'extractous',
    'pypdfium2',
    'pymupdf',
    'pymupdf_llm',
]


def add_page_breaks(text, page_offsets):
    """Return *text* with a page-break marker inserted after every page offset.

    Each entry of *page_offsets* is used as a slice boundary into *text*: the
    slice since the previous boundary is emitted, followed by the marker.
    Whatever remains after the last boundary is appended without a marker.
    When *page_offsets* is empty (or None) *text* is returned unchanged.
    """
    if not page_offsets:
        return text

    marker = "\n<---page-break--->\n"
    pieces = []
    start = 0
    for end in page_offsets:
        pieces.extend((text[start:end], marker))
        start = end

    # Keep any text that follows the final recorded offset.
    if start < len(text):
        pieces.append(text[start:])

    return "".join(pieces)


_logger = logging.getLogger(__name__)


class ExtractorComparer:
    """Step through extractor-output JSON files and record the best extractor.

    Each JSON file is read as a single JSON document mapping extractor names
    to a dict that holds the extracted ``text`` and a ``media`` list.  The
    first media item of the ``pdf_plumber`` entry supplies the base64-encoded
    source PDF.  The chosen extractor for a file is stored next to it as
    ``<name without extension>_best.txt``.
    """

    def __init__(self):
        self.json_files = []           # paths of the loaded JSON files, sorted by name
        self.current_index = 0         # index into json_files of the file being shown
        self.current_data = None       # parsed JSON of the current file, None if not loaded
        self.temp_pdf_path = None      # temp file holding the current PDF, if any
        self.current_pdf_bytes = None  # decoded bytes of the current PDF, if any

    @staticmethod
    def _annotation_path(json_path):
        """Return the path of the ``_best.txt`` annotation file for *json_path*."""
        return os.path.splitext(json_path)[0] + "_best.txt"

    def _discard_temp_pdf(self):
        """Delete the temporary PDF of the previous document (best effort)."""
        if not self.temp_pdf_path:
            return
        try:
            os.remove(self.temp_pdf_path)
        except OSError as e:
            # The file may already be gone; failing to clean up is not fatal.
            _logger.debug("Could not remove %s: %s", self.temp_pdf_path, e)
        self.temp_pdf_path = None

    def _reset_document_state(self):
        """Forget everything tied to the previously loaded document."""
        self.current_data = None
        self.current_pdf_bytes = None
        self._discard_temp_pdf()

    @staticmethod
    def _extract_pdf_bytes(data):
        """Return the decoded PDF stored under ``pdf_plumber``, or None.

        None is returned when any level of the expected structure is missing
        or malformed, or when the base64 payload cannot be decoded.
        """
        if not isinstance(data, dict):
            return None
        plumber_data = data.get('pdf_plumber')
        if not isinstance(plumber_data, dict):
            return None
        media = plumber_data.get('media')
        if not isinstance(media, list) or not media:
            return None
        media_item = media[0]
        if not isinstance(media_item, dict):
            return None
        encoded = media_item.get('media_bytes')
        if not encoded:
            return None
        try:
            return base64.b64decode(encoded)
        except (ValueError, TypeError) as e:
            _logger.warning("Error decoding media_bytes: %s", e)
            return None

    def load_files(self, directory_path):
        """Collect the ``.json``/``.jsonl`` files found in *directory_path*.

        The files are sorted by name so that navigation order is stable, and
        any previously loaded document is forgotten.

        Returns:
            ``(file_progress, annotation_status)`` strings for the UI.
        """
        self.json_files = []
        self.current_index = 0
        self._reset_document_state()

        try:
            names = sorted(os.listdir(directory_path))
        except Exception as e:
            _logger.warning("Could not list %r: %s", directory_path, e)
            return f"Error loading files: {str(e)}", "Error"

        self.json_files = [
            os.path.join(directory_path, name)
            for name in names
            if name.endswith(('.json', '.jsonl'))
        ]
        if not self.json_files:
            return "No JSON files found", "No files loaded"
        return self.get_progress_info()

    def load_current_file(self):
        """Parse the file at ``current_index`` and extract its PDF.

        On success ``current_data`` holds the parsed JSON; when a PDF is
        present it is also written to a fresh temporary file
        (``temp_pdf_path``) and kept in ``current_pdf_bytes``.  State left
        over from the previous document is always cleared first, so a failed
        load or a document without a PDF never exposes stale data.

        Returns:
            ``(base64_pdf_or_None, file_progress, annotation_status)``.
        """
        if not self.json_files:
            return None, "N/A", "N/A"

        self._reset_document_state()
        path = self.json_files[self.current_index]
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            pdf_bytes = self._extract_pdf_bytes(data)
            if pdf_bytes:
                with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
                    temp_file.write(pdf_bytes)
                    self.temp_pdf_path = temp_file.name
        except Exception:
            # UI callback boundary: report the failure instead of raising.
            _logger.exception("Error loading file %s", path)
            self._reset_document_state()
            return None, "Error loading file", "No annotation"

        self.current_data = data
        file_progress, annotation_status = self.get_progress_info()
        if not pdf_bytes:
            return None, file_progress, annotation_status

        self.current_pdf_bytes = pdf_bytes
        base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
        return base64_pdf, file_progress, annotation_status

    def get_progress_info(self):
        """Describe the position in the file list and the annotation state.

        Returns:
            ``(file_progress, annotation_status)`` where *file_progress* names
            the current file and how many files are annotated, and
            *annotation_status* reports the extractor saved for the current
            file (or "Not annotated").
        """
        if not self.json_files:
            return "No files loaded", "No annotation"

        current_file = self.json_files[self.current_index]
        total = len(self.json_files)
        file_progress = f"File {self.current_index + 1} of {total}: {Path(current_file).name}"

        annotation_status = "Not annotated"
        best_extractor_file = self._annotation_path(current_file)
        if os.path.exists(best_extractor_file):
            try:
                with open(best_extractor_file, 'r', encoding='utf-8') as f:
                    annotation_status = f"Best extractor: {f.read().strip()}"
            except (OSError, ValueError) as e:
                # An unreadable annotation is shown as "Not annotated".
                _logger.warning("Could not read %s: %s", best_extractor_file, e)

        annotated_count = sum(
            os.path.exists(self._annotation_path(json_file))
            for json_file in self.json_files
        )
        file_progress = f"{file_progress} (Annotated: {annotated_count}/{total})"

        return file_progress, annotation_status

    def get_extractor_text(self, extractor_name):
        """Return the current document's text for *extractor_name* with page breaks.

        Returns an empty string when no document is loaded or the extractor is
        absent, and a "No text found" message when its entry has no ``text``.
        Page offsets are read from the first media item's
        ``metadata.pdf_metadata.page_offsets`` when that path exists.
        """
        if not self.current_data or extractor_name not in self.current_data:
            return ""

        extractor_data = self.current_data[extractor_name]
        if not isinstance(extractor_data, dict) or 'text' not in extractor_data:
            return f"No text found for {extractor_name}"

        # A null text in the JSON is treated as empty rather than crashing.
        text = extractor_data.get('text') or ''

        page_offsets = []
        media = extractor_data.get('media')
        if isinstance(media, list) and media and isinstance(media[0], dict):
            metadata = media[0].get('metadata')
            pdf_metadata = metadata.get('pdf_metadata') if isinstance(metadata, dict) else None
            if isinstance(pdf_metadata, dict):
                page_offsets = pdf_metadata.get('page_offsets') or []

        return add_page_breaks(text, page_offsets)

    def next_pdf(self):
        """Advance to the next file (wrapping around) and load it."""
        if not self.json_files:
            return None, "N/A", "N/A"

        self.current_index = (self.current_index + 1) % len(self.json_files)
        return self.load_current_file()

    def prev_pdf(self):
        """Go back to the previous file (wrapping around) and load it."""
        if not self.json_files:
            return None, "N/A", "N/A"

        self.current_index = (self.current_index - 1) % len(self.json_files)
        return self.load_current_file()

    def set_best_extractor(self, extractor_name):
        """Save *extractor_name* as the best extractor for the current file.

        The name is written to the file's ``_best.txt`` annotation, replacing
        any earlier choice.

        Returns:
            ``(file_progress, annotation_status)`` strings for the UI.
        """
        if not self.json_files or not self.current_data:
            return "N/A", "N/A"

        result_file = self._annotation_path(self.json_files[self.current_index])
        try:
            with open(result_file, 'w', encoding='utf-8') as f:
                f.write(extractor_name)
        except Exception:
            # UI callback boundary: report the failure instead of raising.
            _logger.exception("Error saving annotation to %s", result_file)
            return "Error saving annotation", "No annotation"

        return self.get_progress_info()


# Styling for the two extractor output text areas.
_VIEWER_CSS = """
.extraction-text textarea {
    font-family: Arial, Helvetica, sans-serif !important;
    font-size: 14px !important;
    line-height: 1.5 !important;
}
"""

# Injected into <head> so PDF.js is normally available before the viewer runs.
_PDFJS_HEAD_HTML = """
<script src="https://unpkg.com/pdfjs-dist@3.11.174/build/pdf.min.js"></script>
"""

# Initial markup of the viewer: a scrollable page container plus an overlay
# used for status and error messages.
_PDF_VIEWER_PLACEHOLDER_HTML = '''
<div style="width:100%; height:700px; position:relative; border:1px solid #ddd;">
    <div id="pdf-container" style="width:100%; height:100%; overflow:auto;"></div>
    <div id="pdf-fallback" style="position:absolute; top:0; left:0; width:100%; height:100%;
         display:flex; align-items:center; justify-content:center; padding:20px; text-align:center; font-family: Arial, sans-serif;">
        Click "Load PDFs" to start viewing documents.
    </div>
</div>
'''

# Client-side viewer: watches the hidden textbox that receives the base64 PDF
# and renders it with PDF.js; also wires the arrow-key shortcuts.
_PDF_VIEWER_JS = """
function() {
    console.log("Setting up PDF.js viewer");

    const PDFJS_SRC = "https://unpkg.com/pdfjs-dist@3.11.174/build/pdf.min.js";
    const PDFJS_WORKER_SRC = "https://unpkg.com/pdfjs-dist@3.11.174/build/pdf.worker.min.js";
    // Shorter values are treated as "no PDF"
    const MIN_PDF_DATA_LENGTH = 100;

    // Configure PDF.js worker
    if (window.pdfjsLib) {
        window.pdfjsLib.GlobalWorkerOptions.workerSrc = PDFJS_WORKER_SRC;
        console.log("PDF.js configured with worker");
    } else {
        console.warn("PDF.js not found in head, attempting to load dynamically");
        // Fallback to load PDF.js dynamically if not in the head
        const pdfJsScript = document.createElement('script');
        pdfJsScript.src = PDFJS_SRC;
        pdfJsScript.onload = function() {
            window.pdfjsLib.GlobalWorkerOptions.workerSrc = PDFJS_WORKER_SRC;
            console.log("PDF.js loaded dynamically");
        };
        document.head.appendChild(pdfJsScript);
    }

    // Hash of the PDF currently shown ("" = nothing shown / force a refresh)
    let currentPdfHash = "";
    // Hash of the last PDF that failed, so polling does not re-parse it every tick
    let failedPdfHash = "";
    // Bumped for every new render so an older, slower render can tell it is stale
    let renderGeneration = 0;
    // True once a document has been handed to the viewer and not yet cleared
    let viewerActive = false;

    function hasPdfData(value) {
        return Boolean(value) && value.length > MIN_PDF_DATA_LENGTH;
    }

    // Show a message in the overlay; textContent keeps error text from being parsed as HTML
    function showFallback(message, color) {
        const fallback = document.getElementById('pdf-fallback');
        fallback.textContent = message;
        fallback.style.color = color || '';
        fallback.style.display = 'flex';
    }

    // Remove the displayed document (used when the current file has no PDF)
    function clearViewer() {
        renderGeneration += 1;
        currentPdfHash = "";
        failedPdfHash = "";
        viewerActive = false;
        document.getElementById('pdf-container').innerHTML = '';
        showFallback('No PDF available for this document.');
    }

    // Function to render a PDF page
    async function renderPage(pdf, pageNumber, container) {
        // The wrapper is attached before the first await, so pages keep document
        // order even though they finish rendering at different times.
        const pageContainer = document.createElement('div');
        pageContainer.className = 'pdf-page';
        pageContainer.style.position = 'relative';
        pageContainer.style.margin = '10px auto';
        pageContainer.style.boxShadow = '0 2px 5px rgba(0,0,0,0.2)';

        const canvas = document.createElement('canvas');
        pageContainer.appendChild(canvas);
        container.appendChild(pageContainer);

        try {
            const page = await pdf.getPage(pageNumber);

            // Set up viewport with scale based on container width
            const containerWidth = container.clientWidth - 30; // Account for margins
            const originalViewport = page.getViewport({ scale: 1 });
            const scale = containerWidth / originalViewport.width;
            const viewport = page.getViewport({ scale });

            // Set canvas dimensions
            canvas.width = viewport.width;
            canvas.height = viewport.height;

            // Render the PDF page into canvas context
            await page.render({
                canvasContext: canvas.getContext('2d'),
                viewport: viewport
            }).promise;

            return true;
        } catch (error) {
            console.error(`Error rendering page ${pageNumber}:`, error);
            pageContainer.remove();
            return false;
        }
    }

    // Simple hash function for PDF data to detect changes
    function hashData(str) {
        let hash = 0;
        if (str.length === 0) return hash;
        for (let i = 0; i < Math.min(str.length, 10000); i++) {
            const char = str.charCodeAt(i);
            hash = ((hash << 5) - hash) + char;
            hash = hash & hash;
        }
        // Also include the length as PDFs with same start can be different
        return `${hash}_${str.length}`;
    }

    // Function to display PDF from base64 data
    async function displayPdfFromBase64(base64Data) {
        if (!hasPdfData(base64Data)) {
            console.log("No valid PDF data received");
            clearViewer();
            return;
        }

        // Skip data that is already displayed or that already failed to load
        const dataHash = hashData(base64Data);
        if (dataHash === currentPdfHash || dataHash === failedPdfHash) {
            return;
        }

        // The hash is recorded only after this check; otherwise the retry on the
        // next poll would be skipped as "already displayed" and never render.
        if (!window.pdfjsLib) {
            console.warn("PDF.js not loaded yet, waiting...");
            showFallback('Loading PDF viewer...');
            return;
        }
        if (!window.pdfjsLib.GlobalWorkerOptions.workerSrc) {
            window.pdfjsLib.GlobalWorkerOptions.workerSrc = PDFJS_WORKER_SRC;
        }

        renderGeneration += 1;
        const generation = renderGeneration;
        currentPdfHash = dataHash;
        failedPdfHash = "";
        viewerActive = true;
        console.log("PDF changed, rendering new document");

        const container = document.getElementById('pdf-container');
        let stage = "processing";
        try {
            // Convert base64 to array buffer
            const binaryString = atob(base64Data);
            const bytes = new Uint8Array(binaryString.length);
            for (let i = 0; i < binaryString.length; i++) {
                bytes[i] = binaryString.charCodeAt(i);
            }

            // Clear existing content and show a loading indicator
            container.innerHTML = '';
            document.getElementById('pdf-fallback').style.display = 'none';
            const loadingIndicator = document.createElement('div');
            loadingIndicator.style.padding = '20px';
            loadingIndicator.style.textAlign = 'center';
            loadingIndicator.innerText = 'Loading PDF...';
            container.appendChild(loadingIndicator);

            // Load document
            stage = "loading";
            const pdf = await window.pdfjsLib.getDocument({ data: bytes }).promise;
            if (generation !== renderGeneration) {
                return; // A newer document took over while this one was loading
            }

            // Clear the loading indicator
            container.innerHTML = '';
            console.log(`PDF loaded with ${pdf.numPages} pages`);

            // Render all pages
            const pagePromises = [];
            for (let i = 1; i <= pdf.numPages; i++) {
                pagePromises.push(renderPage(pdf, i, container));
            }
            await Promise.all(pagePromises);
            if (generation !== renderGeneration) {
                return;
            }
            console.log("All pages rendered");

            // Scroll to top
            container.scrollTop = 0;
        } catch (error) {
            console.error(`Error ${stage} PDF:`, error);
            if (generation !== renderGeneration) {
                return;
            }
            container.innerHTML = '';
            showFallback(`Error ${stage} PDF: ${error.message || 'Unknown error'}`, 'red');
            currentPdfHash = "";
            failedPdfHash = dataHash; // Retried only when the user navigates
        }
    }

    // Bring the viewer in line with the hidden textbox value
    function syncViewer(textarea) {
        if (hasPdfData(textarea.value)) {
            displayPdfFromBase64(textarea.value);
        } else if (viewerActive) {
            // The current document has no PDF: do not keep showing the previous one
            clearViewer();
        }
    }

    // Check for PDF data
    function setupPdfListener() {
        const dataElement = document.getElementById('pdf_base64_data');
        if (!dataElement) {
            console.log("PDF data element not found, will retry");
            setTimeout(setupPdfListener, 1000);
            return;
        }

        const textarea = dataElement.querySelector('textarea');
        if (!textarea) {
            console.log("Textarea not found, will retry");
            setTimeout(setupPdfListener, 1000);
            return;
        }

        console.log("Found PDF data element, setting up listeners");

        // Display initial data if available
        syncViewer(textarea);

        // Use both an observer and polling for robustness
        // 1. Create MutationObserver to watch for value changes
        const observer = new MutationObserver(() => syncViewer(textarea));
        observer.observe(textarea, {
            attributes: true,
            characterData: true,
            subtree: true,
            childList: true
        });

        // 2. Also use polling as a fallback
        setInterval(() => syncViewer(textarea), 1000);

        // Monitor the next/prev buttons to force PDF refresh
        for (const buttonId of ['prev_button', 'next_button']) {
            const navButton = document.getElementById(buttonId);
            if (navButton) {
                navButton.addEventListener('click', () => {
                    console.log(`${buttonId} clicked, forcing PDF refresh`);
                    currentPdfHash = ""; // Reset hash to force refresh
                    failedPdfHash = "";  // Allow a failed document to be retried
                });
            }
        }
    }

    // Start checking for PDF data
    setTimeout(setupPdfListener, 1000);

    // Add keyboard shortcuts
    document.addEventListener('keydown', function(event) {
        if (event.target.tagName === 'INPUT' || event.target.tagName === 'TEXTAREA') {
            return;
        }

        var buttonId = null;
        if (event.key === 'ArrowLeft') buttonId = 'prev_button';
        else if (event.key === 'ArrowRight') buttonId = 'next_button';

        if (buttonId) {
            var button = document.getElementById(buttonId);
            if (button) {
                event.preventDefault();
                button.click();
            }
        }
    });
}
"""


def create_interface():
    """Build the Gradio UI for comparing extractor outputs against the PDF.

    The layout has a directory loader, a PDF.js viewer fed through a hidden
    textbox, navigation and "best extractor" buttons, and two dropdown/text
    pairs for side-by-side comparison.  All callbacks share one
    ``ExtractorComparer`` instance created here.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    comparer = ExtractorComparer()

    def make_annotator(extractor_name):
        """Return a zero-argument callback that saves *extractor_name* as best."""
        def annotate():
            return comparer.set_best_extractor(extractor_name)
        return annotate

    with gr.Blocks(
        title="PDF Extractor Comparer",
        theme="soft",
        css=_VIEWER_CSS,
        head=_PDFJS_HEAD_HTML,
    ) as demo:
        gr.Markdown("## PDF Extractor Comparer")

        with gr.Row():
            directory_input = gr.Textbox(
                label="Path to JSON Directory",
                placeholder="e.g., /path/to/your/json/files",
                value="extraction/truncated"
            )
            load_button = gr.Button("Load PDFs", variant="primary")

        with gr.Row():
            with gr.Column(scale=3):
                pdf_viewer_html = gr.HTML(
                    label="PDF Document",
                    value=_PDF_VIEWER_PLACEHOLDER_HTML
                )
                # Carries the base64 PDF to the browser; the viewer script reads it.
                pdf_data_hidden = gr.Textbox(visible=False, elem_id="pdf_base64_data")

            with gr.Column(scale=1):
                file_progress_output = gr.Textbox(label="File Progress", interactive=False)
                annotation_status_output = gr.Textbox(label="Annotation Status", interactive=False)

                with gr.Row():
                    prev_button = gr.Button("⬅️ Previous", elem_id="prev_button")
                    next_button = gr.Button("Next ➡️", elem_id="next_button")

                gr.Markdown("### Select Best Extractor")
                for extractor in EXTRACTORS:
                    # The name is bound in a closure, so no hidden input
                    # component is needed to pass it to the callback.
                    gr.Button(extractor, variant="secondary").click(
                        make_annotator(extractor),
                        outputs=[file_progress_output, annotation_status_output]
                    )

        gr.Markdown("### Extractor Comparison")

        first_default = EXTRACTORS[0] if EXTRACTORS else None
        second_default = EXTRACTORS[1] if len(EXTRACTORS) > 1 else first_default
        with gr.Row():
            extractor1_dropdown = gr.Dropdown(
                choices=EXTRACTORS,
                label="Extractor 1",
                value=first_default
            )
            extractor2_dropdown = gr.Dropdown(
                choices=EXTRACTORS,
                label="Extractor 2",
                value=second_default
            )

        with gr.Row():
            extractor1_text = gr.Textbox(
                label="Extractor 1 Output",
                lines=15,
                elem_classes=["extraction-text"]
            )
            extractor2_text = gr.Textbox(
                label="Extractor 2 Output",
                lines=15,
                elem_classes=["extraction-text"]
            )

        comparison_panes = [
            (extractor1_dropdown, extractor1_text),
            (extractor2_dropdown, extractor2_text),
        ]

        def then_refresh_texts(event):
            """Chain a refresh of both comparison panes after *event*."""
            for dropdown, textbox in comparison_panes:
                event = event.then(
                    comparer.get_extractor_text,
                    inputs=[dropdown],
                    outputs=[textbox]
                )
            return event

        document_outputs = [pdf_data_hidden, file_progress_output, annotation_status_output]

        then_refresh_texts(
            load_button.click(
                comparer.load_files,
                inputs=[directory_input],
                outputs=[file_progress_output, annotation_status_output]
            ).then(
                comparer.load_current_file,
                outputs=document_outputs
            )
        )
        then_refresh_texts(prev_button.click(comparer.prev_pdf, outputs=document_outputs))
        then_refresh_texts(next_button.click(comparer.next_pdf, outputs=document_outputs))

        # Changing a dropdown refreshes only its own pane.
        for dropdown, textbox in comparison_panes:
            dropdown.change(
                comparer.get_extractor_text,
                inputs=[dropdown],
                outputs=[textbox]
            )

        # Install the client-side viewer and keyboard shortcuts on page load.
        demo.load(fn=None, js=_PDF_VIEWER_JS)

    return demo


if __name__ == "__main__":
    # Build the UI and start the Gradio server when run as a script.
    demo = create_interface()
    demo.launch()