import torch
from PIL import Image
from peft import PeftModel
from transformers import AutoModelForImageTextToText, AutoProcessor
# Hugging Face identifiers: the base checkpoint and the PEFT adapter applied on top.
BASE = "Qwen/Qwen3.5-2B"
ADAPTER = "jhanwarsid/qwen3.5-2b-omr-lora"

# Non-value labels. "<blank>" and "<strikethrough>" are requested by the
# prompts below; "<unclear>" is not mentioned in any prompt.
# NOTE(review): confirm where "<unclear>" is expected to come from.
CATEGORIES = {
    "<blank>",
    "<strikethrough>",
    "<unclear>",
}

# Every prompt is the same sentence frame around a field-specific description.
_PROMPT_HEAD = (
    "This image is a single field cropped from a student OMR sheet containing "
)
_PROMPT_TAIL = (
    ". Transcribe it exactly as written, outputting only the value with no "
    "spaces, quotes, labels, or explanation. If the field is empty with "
    "nothing written, output <blank>. If a value was written and then struck "
    "through or crossed out, output <strikethrough>."
)
_FIELD_DESCRIPTIONS = (
    (
        "registration_no",
        "a handwritten registration number (digits and letters, e.g. 2572U00739)",
    ),
    (
        "roll_no",
        "a handwritten roll number (digits only, e.g. 7200739)",
    ),
    (
        "course_code",
        "a handwritten course code (a letter followed by digits, e.g. U12028)",
    ),
    (
        "marks_obtained",
        "the handwritten marks obtained (a number, e.g. 45)",
    ),
)

# Field name -> full instruction text sent alongside the cropped image.
PROMPTS = {
    field_name: _PROMPT_HEAD + description + _PROMPT_TAIL
    for field_name, description in _FIELD_DESCRIPTIONS
}
# Processor for the base checkpoint; used for chat templating, input
# preparation and (via its tokenizer) decoding.
processor = AutoProcessor.from_pretrained(BASE, trust_remote_code=True)

# Load the base weights in bfloat16, then wrap them with the PEFT adapter.
_base_model = AutoModelForImageTextToText.from_pretrained(
    BASE,
    dtype=torch.bfloat16,
    trust_remote_code=True,
)
# .eval() switches the wrapped module to evaluation mode and returns it.
model = PeftModel.from_pretrained(_base_model, ADAPTER).eval()
def read_field(image_path: str, field: str, max_new_tokens: int = 24) -> str:
    """Transcribe one cropped OMR field image with the adapted model.

    Args:
        image_path: Path of the cropped field image. It is opened with PIL
            and converted to RGB.
        field: Key into ``PROMPTS`` selecting the instruction text.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``.

    Returns:
        The decoded continuation (prompt tokens removed, special tokens
        skipped) with surrounding whitespace stripped.

    Raises:
        KeyError: If ``field`` is not a key of ``PROMPTS``. Raised before the
            image is opened.
    """
    # Validate the field first so a typo fails fast, without touching disk.
    try:
        prompt = PROMPTS[field]
    except KeyError:
        raise KeyError(
            f"unknown field {field!r}; expected one of {sorted(PROMPTS)}"
        ) from None

    # The context manager releases the file handle deterministically;
    # convert() returns a separate image that stays usable afterwards.
    with Image.open(image_path) as source:
        img = source.convert("RGB")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    chat_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_text], images=[[img]], return_tensors="pt"
    ).to(model.device)

    # Sampling is disabled so the same input is decoded the same way each call.
    out = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # The output sequence starts with the prompt tokens; keep only what
    # follows them.
    prompt_len = inputs["input_ids"].shape[1]
    value = processor.tokenizer.decode(
        out[0][prompt_len:], skip_special_tokens=True
    )
    return value.strip()
# Demo run: transcribe the sample roll-number crop and print the result.
_demo_value = read_field("roll_no.jpg", "roll_no")
print(_demo_value)