from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from peft import PeftModel
from PIL import Image
import torch
# Base checkpoint identifier and the LoRA adapter that is applied on top of it.
MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
ADAPTER_ID = "lgtk/qwen25vl-3b-modi-lora"

# 4-bit NF4 quantization settings; the compute dtype is bfloat16.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the base model with the quantization config above, leaving device
# placement to device_map="auto", then wrap it with the adapter and switch
# the result to evaluation mode.
base = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb,
    device_map="auto",
)
model = PeftModel.from_pretrained(base, ADAPTER_ID)
model.eval()

# The processor is loaded from the base checkpoint, not from the adapter.
# NOTE(review): max_pixels presumably caps the image area the processor will
# keep (512 patches of 28x28 pixels) -- confirm against the Qwen2.5-VL docs.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    max_pixels=512 * 28 * 28,
)
# Instruction given to the model together with the image: it describes the
# script, asks for a Devanagari transliteration, and forbids extra commentary.
# The three sentences are joined with single spaces.
PROMPT = " ".join(
    (
        "This image contains handwritten text in Modi script, a historical "
        "cursive script used to write the Marathi language.",
        "Transliterate the text in this image into Devanagari script.",
        "Output only the Devanagari text, with no explanation.",
    )
)
# Load the input image from disk and convert it to RGB mode.
image = Image.open("your_modi_image.jpg").convert("RGB")

# Single-turn conversation: one user message whose content is the image
# followed by the text instruction.
# Each content entry must be a plain dict. Doubled braces ("{{...}}") would
# build a set containing a dict, which raises
# "TypeError: unhashable type: 'dict'" as soon as this statement runs.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": PROMPT},
        ],
    }
]
# Render the conversation to a prompt string (tokenize=False) with the
# generation prompt appended.
text_in = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Build the model inputs from the rendered text and the image as PyTorch
# tensors, then move them to the model's device.
inputs = processor(
    text=[text_in],
    images=[image],
    return_tensors="pt",
).to(model.device)

# Generate without gradient tracking; do_sample=False turns sampling off,
# and at most 256 new tokens are produced.
# The generate call must be indented under the `with` statement, otherwise
# the file fails to parse.
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=256, do_sample=False)

# Skip the first prompt_len positions of each output row before decoding.
# NOTE(review): this assumes generate() returns the prompt tokens followed by
# the newly generated ones -- confirm for this model class.
prompt_len = inputs["input_ids"].shape[1]
result = processor.batch_decode(
    out[:, prompt_len:],
    skip_special_tokens=True,
)[0].strip()
print(result)