import gradio as gr
import spaces
from standalone_velvet import setup_models

# Load the Velvet checkpoint and its companion objects once at startup.
models_dict = setup_models("visual_bloom.torch")
visual_bloom = models_dict["visual_bloom"].to('cuda')
tokenizer = models_dict["tokenizer"]
image_feature_collator = models_dict["image_feature_collator"]


@spaces.GPU  # assumption: the otherwise unused `spaces` import is meant for ZeroGPU allocation
def run_inference(text_input, image_input):
    # Encode the image and the instruction prompt, then generate a reply on the GPU.
    image_features, image_attentions = image_feature_collator([image_input])
    instruction_inputs = tokenizer([text_input], return_tensors="pt")
    language_output = visual_bloom.generate(
        image_features.to('cuda'),
        image_attentions.to('cuda'),
        instruction_inputs["input_ids"].to('cuda'),
        instruction_inputs["attention_mask"].to('cuda'),
    )
    human_output = tokenizer.decode(language_output[0], skip_special_tokens=True)
    # Keep only the first sentence of the decoded text.
    return human_output.split(".")[0]
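

# A minimal sketch of calling run_inference directly, outside the Gradio UI,
# assuming a local image at examples/cat.png; it mirrors what the button click does:
#
#     from PIL import Image
#     cat = Image.open("examples/cat.png").convert("RGB")
#     print(run_inference("Generate answer in en: what is the color of the cat?", cat))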


if __name__ == "__main__":
    markdown = """
# Quick introduction

We have built a prompt-based vision-language model.
The model can caption images and answer questions about images.
It is trained on CC3M, COCO, VQAv2, OK-VQA, TextCaps, and TextVQA.
With the help of Google Translate, these datasets collectively contain millions of image-text pairs in both English and Vietnamese.

For further details, please refer to [Velvet](https://github.com/dinhanhx/velvet?tab=readme-ov-file#introduction).

# Usage

## Run with pre-defined examples

1. Scroll to the bottom of the page to see the examples.
2. Click one of them.
3. Click the `Run Inference` button.

## Run with user-defined inputs

### 1. Prepare text input

Image captioning:

- `Generate caption in en:`
- `Generate caption in vi:`

Visual question answering:

- `Generate answer in en: <question>?`
- `Generate answer in vi: <question>?`

Don't forget to replace `<question>` with your own question, in either English or Vietnamese.
When writing a prompt, you can refer to the examples at the bottom of the page.

### 2. Prepare image input

Upload an image as described in the Image Input box. A wide range of image formats is supported by PIL.

### 3. Click the `Run Inference` button
    """
    examples = [
        ["Generate caption in en:", "examples/cat.png"],
        ["Generate caption in vi:", "examples/cat.png"],
        ["Generate answer in en: what is the color of the cat?", "examples/cat.png"],
        # Vietnamese for "what is the color of the cat?"
        ["Generate answer in vi: màu sắc của con mèo là gì?", "examples/cat.png"],
    ]

    with gr.Blocks() as demo:
        gr.Markdown(markdown)
        text_input = gr.Textbox(label="Text Input")
        image_input = gr.Image(label="Image Input", type="pil")
        text_output = gr.Textbox(label="Text Output")
        infer_button = gr.Button("Run Inference")
        infer_button.click(
            run_inference, inputs=[text_input, image_input], outputs=text_output
        )
        gr.Examples(
            examples=examples,
            inputs=[text_input, image_input],
        )

    demo.launch()
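
# A rough sketch of querying the deployed demo remotely with gradio_client
# (hypothetical Space id and endpoint name; check the Space's "Use via API" page):
#
#     from gradio_client import Client, handle_file
#     client = Client("<user>/<space>")
#     print(client.predict(
#         "Generate caption in en:",
#         handle_file("examples/cat.png"),
#         api_name="/run_inference",
#     ))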