| | |
| | |
| | |
| | |
| | |
| | |
| | import gradio as gr |
| | from transformers import AutoProcessor, Idefics3ForConditionalGeneration, image_utils |
| | import torch |
| | import spaces |
| |
|
| | |
# Select the compute device: CUDA when torch reports it, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Hub repository ids: the base checkpoint and the adapter applied on top of it.
base_model_path = "HuggingFaceM4/Idefics3-8B-Llama3"
model_id = "eltorio/IDEFICS3_ROCO"

# The processor is loaded from the base checkpoint, not from the adapter repo.
processor = AutoProcessor.from_pretrained(base_model_path, trust_remote_code=True)

# Load the base weights in bfloat16, then move the model to the chosen device.
model = Idefics3ForConditionalGeneration.from_pretrained(
    base_model_path,
    torch_dtype=torch.bfloat16,
)
model = model.to(device)

# Attach the adapter weights to the already-loaded base model.
model.load_adapter(model_id, device_map="auto")
| |
|
| | |
@spaces.GPU
def infere(image):
    """
    Generate a description of a medical image.

    Args:
    - image: The medical image to describe. It is handed unchanged to the
      processor as a one-element list.
      NOTE(review): the interface is wired with the ``"image"`` shortcut,
      which presumably delivers a numpy array rather than a PIL Image --
      confirm against the Gradio Image component defaults.

    Returns:
    - generated_texts (List[str]): The decoded output of ``model.generate``,
      one string per generated sequence (a single entry here, since one
      prompt is submitted).
      NOTE(review): the whole generated sequence is decoded, so the text
      presumably starts with the prompt -- confirm before relying on it.

    Raises:
    - gr.Error: If no image was supplied.
    """
    # An empty submission reaches this function as None; fail with a message
    # the UI can show instead of an opaque error from inside the processor.
    if image is None:
        raise gr.Error("Please provide a medical image before submitting.")

    # Fixed conversation: a system instruction, then a user turn holding the
    # image placeholder and the question.
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a valuable medical doctor and you are looking at an image of your patient."},
            ]
        },
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What do we see in this image?"},
            ]
        },
    ]

    # Render the conversation as a prompt string ending with the generation
    # prompt so the model continues as the assistant.
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

    # Build the model inputs from the prompt text and the image.
    inputs = processor(text=prompt, images=[image], return_tensors="pt")

    # Move every input tensor to the device the model lives on.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate at most 100 new tokens.
    generated_ids = model.generate(**inputs, max_new_tokens=100)

    # Decode the token ids to text, dropping special tokens.
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

    return generated_texts
| |
|
| | |
# Page header (HTML): link to the model card plus the device in use.
title = (
    "<a href='https://huggingface.co/eltorio/IDEFICS3_ROCO'>IDEFICS3_ROCO</a>"
    f": Medical Image to Text <b>running on {device}</b>"
)

# Short description shown under the title.
desc = (
    "This model generates a description of a medical image.<br>"
    "<b>Note: No affiliation with original author. "
    "This is a ZeroGPU-enabled duplicate of "
    "<a href='https://huggingface.co/spaces/eltorio/IDEFICS3_ROCO'>spaces/eltorio/IDEFICS3_ROCO</a>"
    " to support accelerated inference. "
    "Please direct your citations and likes to the original work.</b>"
)

# Device notice: a short confirmation on CUDA, a slowness warning otherwise.
if device == 'cuda':
    device_desc = f"This model is running on {device} 🚀."
else:
    device_desc = (
        f"🐢 This model is running on {device} it will be very (very) slow. "
        "If you can donate some GPU time it will be usable 🐢. "
        "<a href='https://huggingface.co/eltorio/IDEFICS3_ROCO/discussions'>Please contact us.</a>"
    )

# Longer article text, which embeds the device notice built above.
long_desc = (
    "This demo is based on the "
    "<a href='https://huggingface.co/eltorio/IDEFICS3_ROCO'>IDEFICS3_ROCO model</a>"
    ", which is a multimodal model that can generate text from images. "
    "It has been fine-tuned on "
    "<a href='https://huggingface.co/datasets/eltorio/ROCO-radiology'>eltorio/ROCO-radiology</a>"
    " a dataset of medical images and can generate descriptions of medical images. "
    "Try uploading an image of a medical image and see what the model generates!"
    f"<br><b>{device_desc}</b><br> 2024 - Ronan Le Meillat"
)

# Wire the inference function to an image input and a text output.
radiotest = gr.Interface(
    fn=infere,
    inputs="image",
    outputs="text",
    title=title,
    description=desc,
    article=long_desc,
)

# Start the web app.
radiotest.launch()