from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
# Checkpoint identifier shared by the model weights and the processor, kept in
# one place so the two can never drift apart.
MODEL_ID = "Mininglamp-2718/Mano-CUA-2.0-4B"

# Load the vision-language model. `dtype` is the current keyword for the
# deprecated `torch_dtype`; "auto" defers the precision choice to the
# checkpoint, and device_map="auto" defers device placement to the library.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    dtype="auto",
    device_map="auto",
)

# The processor bundles the chat template, tokenizer and image preprocessing
# that belong to the same checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_ID)
# Read the screenshot and rescale it to a fixed width of 1280 pixels; the
# height is scaled by the same factor (truncated to an integer) so the aspect
# ratio is preserved.
img = Image.open("screenshot.png")
ratio = 1280 / img.width
scaled_size = (1280, int(img.height * ratio))
img = img.resize(scaled_size, Image.LANCZOS)
# Natural-language instruction the agent is asked to carry out.
task = "Click the search bar and type hello"

# The prompt is assembled one line per entry and joined with newlines: role
# preamble, required output format, the action space the model may pick from,
# and finally the user instruction. The wording is part of the model's input,
# so each line is kept verbatim.
prompt_text = "\n".join(
    (
        "You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.",
        "## Output Format",
        "<action>action</action>",
        "## Action Space",
        "open_app(app_name='') # Open an application by name.",
        "open_url(url='') # Open a URL in the browser.",
        "click(start_box='<|box_start|>(x1,y1)<|box_end|>')",
        "type(content='') # type the content.",
        "hotkey(key='') # Trigger a keyboard shortcut.",
        "scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left', amount='scroll_amount')",
        "drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')",
        "wait(duration='') # Sleep for specified duration (in seconds).",
        "finish() # The task is completed.",
        "stop(reason='') # If the item can not found in the image, give the reason",
        "## User Instruction",
        task,
    )
)
# Chat-format conversation: a system turn followed by one user turn that
# carries the screenshot and the textual prompt.
# Each message must be a plain dict. Doubled braces (`{{...}}`) would build a
# set containing a dict, which fails with "unhashable type: 'dict'".
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text", "text": prompt_text},
        ],
    },
]
# Render the conversation to a prompt string (not yet tokenized), ending with
# the generation prompt so the model continues as the assistant.
text_input = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Pull the image/video payloads out of the messages for the processor.
image_inputs, video_inputs = process_vision_info(messages)

# Tokenize text and preprocess vision inputs into one batch of tensors, moved
# to the device the model lives on.
inputs = processor(
    text=[text_input],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(model.device)

# Greedy decoding: with do_sample=False no sampling takes place, so no
# temperature is passed (a temperature here would be unused and only trigger a
# generation-config warning).
output_ids = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# generate() returns prompt + continuation; drop the prompt tokens so only the
# newly generated part is decoded.
prompt_length = inputs.input_ids.shape[1]
output_ids = output_ids[:, prompt_length:]

output = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
print(output)