| | from docling.document_converter import DocumentConverter, PdfFormatOption |
| | from docling.datamodel.pipeline_options import PdfPipelineOptions |
| | from docling.datamodel.base_models import InputFormat |
| | import time |
| | import base64 |
| | import re |
| | from groq import Groq |
| | import os |
| | from dotenv import load_dotenv |
| | from pathlib import Path |
| |
|
# Load variables from a local .env file (if one exists) into the process
# environment when this module is imported, so that GROQ_API_KEY can be read
# from os.environ when convert_pdf_to_md builds the Groq client.
load_dotenv()
| |
|
# Only pictures whose URI is a base64 image data URI can be sent to the model.
_IMAGE_DATA_URI_RE = re.compile(r"data:image/(?P<type>.+);base64,(?P<data>.+)")

# Marker that the Markdown export leaves where a picture was in the document.
_IMAGE_PLACEHOLDER = "<!-- image -->"

# Text inserted in place of a summary when the model call fails.
_SUMMARY_ERROR_TEXT = "Error summarizing image."

# Context sent to the model when a picture has no extractable caption.
_NO_CAPTION_TEXT = (
    "(No caption was extracted for this figure; rely on the attached image.)"
)

_FIGURE_PROMPT_TEMPLATE = """
You are an expert research assistant in Artificial Intelligence.
Your task is to analyze and summarize a figure from a scientific paper.

The figure may describe an overall architecture, workflow, plot, charts or experimental setup.
Provide a clear, detailed summary that helps a reader understand the design without seeing the image.

When summarizing if figure is model architecture, include:
- The main purpose of the figure (what problem it addresses).
- The overall structure (e.g., input/output, branches, modules, flows).
- The key components (e.g., encoders, decoders, adapters, loss functions).
- The interactions or data flow between components.
- Any special innovations or unique design choices.
if figure is charts, images or plot, analyze it.

Format the summary inside **one section only**.
Do not create multiple headers like ## or ###.
Use bold or bullet points if needed.

Now summarize the following figure:
{image_caption_or_context}
"""


def _collect_pictures(document) -> list:
    """Return one entry per picture item, in document iteration order.

    Each entry is a ``(data_url, caption)`` tuple, or ``None`` when the
    picture has no image attached or its URI is not a base64 image data URI.
    Keeping a ``None`` slot (instead of skipping the picture) keeps the list
    aligned with the picture placeholders in the exported Markdown.
    """
    pictures = []
    for item, _level in document.iterate_items():
        if item.label != "picture":
            continue

        image_ref = getattr(item, "image", None)
        match = None
        if image_ref is not None:
            match = _IMAGE_DATA_URI_RE.match(str(image_ref.uri))
        if match is None:
            pictures.append(None)
            continue

        # caption_text(doc) is the docling-core accessor for a picture's
        # caption; it is looked up defensively so that items without it
        # simply contribute no caption.
        caption_fn = getattr(item, "caption_text", None)
        caption = (caption_fn(document) or "").strip() if callable(caption_fn) else ""

        # group(0) is exactly "data:image/<type>;base64,<data>", i.e. the URL
        # the model expects, so no decode/re-encode round trip is needed.
        pictures.append((match.group(0), caption))
    return pictures


def _summarize_picture(client, picture: tuple) -> str:
    """Ask the vision model for a summary of one ``(data_url, caption)`` picture.

    Summarization is best-effort: any failure is reported on stdout and a
    fixed error text is returned so the rest of the document still converts.
    """
    data_url, caption = picture
    prompt = _FIGURE_PROMPT_TEMPLATE.format(
        image_caption_or_context=caption or _NO_CAPTION_TEXT
    )
    try:
        completion = client.chat.completions.create(
            model="meta-llama/llama-4-scout-17b-16e-instruct",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": data_url}},
                    ],
                }
            ],
            temperature=0.0,
            max_completion_tokens=512,
            top_p=1,
            stream=False,
        )
        summary = completion.choices[0].message.content
    except Exception as exc:  # deliberate: one bad image must not abort the run
        print(f"Error processing image: {exc}")
        return _SUMMARY_ERROR_TEXT
    # An empty/None completion would otherwise be rendered as "None".
    return summary or _SUMMARY_ERROR_TEXT


def _merge_image_summaries(
    md: str, summaries: list, placeholder: str = _IMAGE_PLACEHOLDER
) -> str:
    """Replace each picture placeholder in *md* with its summary.

    *summaries* holds one entry per picture; a ``None`` entry leaves that
    picture's placeholder untouched. If the number of placeholders does not
    match the number of entries, *md* is returned unchanged and a warning is
    printed, because summaries could otherwise land under the wrong figure.
    """
    if all(summary is None for summary in summaries):
        return md

    parts = md.split(placeholder)
    if len(parts) != len(summaries) + 1:
        print("Warning: Number of placeholders doesn't match number of summaries.")
        return md

    pieces = [parts[0]]
    for summary, tail in zip(summaries, parts[1:]):
        if summary is None:
            pieces.append(placeholder)
        else:
            pieces.append(f"\n**Image Summary:**\n{summary}\n")
        pieces.append(tail)
    return "".join(pieces)


def convert_pdf_to_md(pdf_path: str) -> str:
    """Convert a PDF to Markdown, replacing picture placeholders with summaries.

    The PDF is converted with docling (formula enrichment and picture image
    generation enabled) and exported to Markdown. Every picture that carries
    an embedded base64 image is sent to a Groq vision model together with its
    caption (when one is available), and the returned summary replaces the
    picture's ``<!-- image -->`` placeholder in the Markdown.

    Args:
        pdf_path: Path to the PDF file to convert.

    Returns:
        The Markdown text. If there are no summarizable pictures, or the
        placeholders cannot be matched to the pictures, the plain export is
        returned.

    Raises:
        ValueError: If *pdf_path* does not exist.
    """
    if not os.path.exists(pdf_path):
        raise ValueError(f"PDF not found: {pdf_path}")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_formula_enrichment = True
    pipeline_options.generate_picture_images = True

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        }
    )
    document = converter.convert(pdf_path).document
    md = document.export_to_markdown()

    pictures = _collect_pictures(document)
    if all(picture is None for picture in pictures):
        # Nothing to summarize: skip creating the API client entirely, so
        # picture-free PDFs convert even without GROQ_API_KEY configured.
        return md

    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    summaries = [
        None if picture is None else _summarize_picture(client, picture)
        for picture in pictures
    ]
    return _merge_image_summaries(md, summaries)
| |
|
if __name__ == "__main__":
    import sys

    # Manual smoke test: convert the PDF given as the first command-line
    # argument, falling back to the original sample path when none is given.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else r"E:\Study\AI\PE-CLIP.pdf"
    md = convert_pdf_to_md(pdf_path)
    # Print only a preview; the full Markdown can be very long.
    print(md[:1000])