Spaces:
Runtime error
Runtime error
import re

import streamlit as st
import torch
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderConfig, VisionEncoderDecoderModel
def run_prediction(sample):
    """Run the Donut model on one sample and return ``(prediction, target)``.

    Reads the module-level ``processor``, ``pretrained_model``, ``device`` and
    ``task_prompt``; all four must be defined before the first call.

    Args:
        sample: Either a dict carrying ``"pixel_values"`` and
            ``"target_sequence"`` entries, or an image that ``processor``
            can turn into pixel values.

    Returns:
        A 2-tuple ``(prediction, target)``. ``prediction`` is the output of
        ``processor.token2json`` applied to the decoded sequence. ``target``
        is ``processor.token2json(sample["target_sequence"])`` when ``sample``
        is a dict, otherwise the placeholder string ``"<not_provided>"``.
    """
    has_reference = isinstance(sample, dict)

    # Prepare encoder inputs.
    if has_reference:
        pixel_values = torch.tensor(sample["pixel_values"]).unsqueeze(0)
    else:
        # Preprocess the image that was passed in; the original read the
        # module-level ``image`` here and silently ignored the argument.
        pixel_values = processor(sample, return_tensors="pt").pixel_values

    # The task prompt seeds the decoder; special tokens are not added again.
    decoder_input_ids = processor.tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids

    # Run inference.
    outputs = pretrained_model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=pretrained_model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        # Forbid the unknown token in the generated sequence.
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # Decode the first (and only) generated sequence.
    prediction = processor.batch_decode(outputs.sequences)[0]

    # Post-processing: only prompts containing "cord" get the extra cleanup.
    if "cord" in task_prompt:
        prediction = prediction.replace(processor.tokenizer.eos_token, "").replace(
            processor.tokenizer.pad_token, ""
        )
        # Remove the first task start token.
        prediction = re.sub(r"<.*?>", "", prediction, count=1).strip()
    prediction = processor.token2json(prediction)

    # Load the reference target when the sample provides one.
    if has_reference:
        target = processor.token2json(sample["target_sequence"])
    else:
        target = "<not_provided>"

    return prediction, target
# Decoder prompt used by run_prediction to start generation.
task_prompt = "<s>"

st.text('''
This is OCR-free Document Understanding Transformer nicknamed 🍩. It was fine-tuned with 1000 receipt images -> SROIE dataset.
The original 🍩 implementation can be found on: https://github.com/clovaai/donut
''')

# Sidebar controls: what to extract and which bundled receipt to parse.
with st.sidebar:
    information = st.radio(
        "What information inside the receipt are you interested in?",
        ('Receipt Summary', 'Receipt Menu Details', 'Extract all!'))
    receipt = st.selectbox('Pick one receipt', ['1', '2', '3', '4', '5', '6'], index=5)

# Build the path once so the status message always names the file that is
# actually opened (the message used to say .png while the code opened .jpg).
receipt_path = f"./img/receipt-{receipt}.jpg"
st.text(f'{information} mode is ON!\nTarget receipt: {receipt}\n(opening image @:{receipt_path})')

image = Image.open(receipt_path)
st.image(image, caption='Your target receipt')

st.text('baking the 🍩...')
processor = DonutProcessor.from_pretrained("unstructuredio/donut-base-sroie")
pretrained_model = VisionEncoderDecoderModel.from_pretrained("unstructuredio/donut-base-sroie")

device = "cuda" if torch.cuda.is_available() else "cpu"
pretrained_model.to(device)
# The encoder is deliberately NOT cast to bfloat16 here: run_prediction feeds
# it float32 pixel values and the decoder stays float32, so casting only the
# encoder raises a dtype-mismatch error at inference time.
pretrained_model.eval()

st.text('parsing receipt..')
# run_prediction returns a (prediction, target) tuple; it is shown as-is.
parsed_receipt_info = run_prediction(image)
st.text(f'\nRaw output:\n{parsed_receipt_info}')