ZipRerank API & Inference Endpoint

Model details

Table

Architecture	`Qwen3VLForConditionalGeneration`
Base model	`Qwen/Qwen3-VL-8B-Instruct`
Parameters	~8B
Precision	`bfloat16`
Max window size	20 pages per forward pass
Training data	MMDocIR training set + RankZephyr data

Installation

bash
pip install "transformers>=4.57" accelerate torch torchvision pillow pymupdf
# Optional but strongly recommended for fast inference:
pip install flash-attn --no-build-isolation

Quick start

The snippet below is self-contained. It:

1. Renders a PDF to page images with PyMuPDF (you can also pass your own list of `PIL.Image` objects).
2. Builds a RankGPT-style prompt that asks the model to rank pages A–T.
3. Terminates the prompt with a `[` token so that the next predicted token is a page letter.
4. Reads the logits at that position for each letter and sorts pages by score.

python
import fitz  # PyMuPDF
import torch
from PIL import Image
from transformers import AutoProcessor
from transformers.models.qwen3_vl import Qwen3VLForConditionalGeneration

# Hugging Face Hub repository that holds the ZipRerank weights and processor.
MODEL_ID = "mtri-admin/ZipRerank"

# The processor bundles the chat template, image preprocessing and tokenizer.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # `dtype` is the current name of this argument; `torch_dtype` is deprecated
    # in the transformers releases this snippet requires (>=4.57).
    dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",  # or "sdpa" if flash-attn is unavailable
    trust_remote_code=True,
).eval()  # inference only: switch off dropout and other train-time behaviour
tokenizer = processor.tokenizer


def pdf_to_images(pdf_path: str, max_size: int = 1024):
    """Render every page of a PDF to an RGB ``PIL.Image``.

    Each page is scaled uniformly by ``max_size / longest_page_edge``, so the
    longest edge of every rendered image targets ``max_size`` pixels. Pages
    smaller than that are scaled up, larger ones are scaled down.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        max_size: Target length, in pixels, of each page's longest edge.

    Returns:
        A list with one image per page, in document order.
    """
    doc = fitz.open(pdf_path)
    try:
        images = []
        for page in doc:
            scale = max_size / max(page.rect.width, page.rect.height)
            pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
            images.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
        return images
    finally:
        # Release the document handle even when rendering a page raises.
        doc.close()


def create_ranking_prompt(query: str, num_passages: int) -> str:
    """Build the RankGPT-style text instruction for one ranking window.

    The prompt tells the model how many page images follow, maps picture
    ``i + 1`` to the identifier ``[A]``, ``[B]``, ... in order, states the
    search query and asks for a ranking such as ``[A] > [B]``.

    Args:
        query: Search query inserted verbatim into the prompt.
        num_passages: Number of page images that accompany the prompt.

    Returns:
        The prompt as one string whose lines are separated by ``"\\n"``.
    """
    # "Picture 1 is passage [A], Picture 2 is passage [B], ..."
    assignments = ", ".join(
        f"Picture {i + 1} is passage [{chr(ord('A') + i)}]"
        for i in range(num_passages)
    )
    lines = [
        "You are RankGPT, an intelligent assistant that can rank passages "
        "based on their relevancy to the query.",
        "",
        f"I will provide you with {num_passages} passages as images.",
        "Rank the passages based on their relevance to the search query.",
        "",
        "The images are provided in order: " + assignments + ".",
        "",
        f"Search Query: {query}",
        "",
        "Rank the passages above based on their relevance to the search query.",
        "The passages should be listed in descending order using identifiers.",
        "The most relevant passages should be listed first.",
        "The output format should be [A] > [B], etc.",
        "Only output the ranking results, do not say anything else.",
    ]
    # The separator must be the escape sequence "\n"; a raw line break inside
    # the quotes is an unterminated string literal.
    return "\n".join(lines)


@torch.no_grad()
def rerank_window(query: str, images):
    """Score one window of page images with a single forward pass.

    The chat prompt is followed by a literal "[" token, and the logits at that
    final position are read for the letter tokens "A", "B", ... (one letter per
    image). Images are then ordered by those logits, highest first.

    Args:
        query: Search query inserted into the ranking prompt.
        images: Between 1 and 20 page images.

    Returns:
        A list of 0-based indices into ``images``, ordered best-first.
    """
    num_pages = len(images)
    assert 1 <= num_pages <= 20, "Window size must be between 1 and 20."

    # A single user turn: the text instruction followed by every page image.
    content = [{"type": "text", "text": create_ranking_prompt(query, num_pages)}]
    content.extend({"type": "image", "image": page} for page in images)
    conversation = [{"role": "user", "content": content}]

    encoded = processor.apply_chat_template(
        [conversation],
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )

    # Append "[" after the generation prompt so the next predicted token is a
    # page letter.
    bracket_id = tokenizer.encode("[", add_special_tokens=False)[0]
    token_ids = encoded["input_ids"][0].tolist() + [bracket_id]
    device = model.device
    input_ids = torch.tensor([token_ids], dtype=torch.long, device=device)

    outputs = model(
        input_ids=input_ids,
        attention_mask=torch.ones_like(input_ids),
        pixel_values=encoded["pixel_values"].to(device),
        image_grid_thw=encoded["image_grid_thw"].to(device),
    )
    # Logits at the last position, i.e. right after the appended "[".
    next_token_logits = outputs.logits[0, -1, :]

    # One score per page: the logit of that page's letter token.
    scores = []
    for offset in range(num_pages):
        letter = chr(ord("A") + offset)
        letter_id = tokenizer.encode(letter, add_special_tokens=False)[0]
        scores.append(next_token_logits[letter_id].item())

    return sorted(range(num_pages), key=scores.__getitem__, reverse=True)


# Render the PDF, then rank its first 20 pages (the most one window accepts).
pages = pdf_to_images("report.pdf", max_size=1024)
ranking = rerank_window("What is the company revenue?", pages[:20])
print("Best-first page indices:", ranking)

Sliding window for long documents

For documents with more than 20 pages, slide a window from the end of the list toward the beginning, progressively bubbling the most relevant pages to the front. Typical defaults are window_size=20, stride=10 (50% overlap).

python
def rerank(query, images, window_size=20, stride=10):
    """Rerank any number of page images with a back-to-front sliding window.

    Lists of at most ``window_size`` images are ranked by one call to
    ``rerank_window``. Longer lists are processed window by window, starting
    with the last ``window_size`` items and moving ``stride`` positions toward
    the front each step; every window is reordered in place, so items ranked
    highly can keep moving forward through later windows.

    Consecutive windows share ``window_size - stride`` items, so a stride larger
    than the window size leaves items between windows that are never compared.

    Args:
        query: Search query passed to ``rerank_window``.
        images: Page images to rank.
        window_size: Number of images ranked per forward pass.
        stride: How far the window moves toward the front after each pass.

    Returns:
        A list of 0-based indices into ``images``, ordered best-first. An empty
        ``images`` list yields an empty list.

    Raises:
        ValueError: If the sliding procedure is needed and ``stride`` is
            smaller than 1 (the window would never advance).
    """
    n = len(images)
    if n == 0:
        # Nothing to rank; rerank_window rejects empty windows.
        return []
    if n <= window_size:
        return rerank_window(query, images)
    if stride < 1:
        # A zero stride would repeat the same window forever, and a negative
        # one would move the window past the end of the list.
        raise ValueError(f"stride must be at least 1, got {stride}")

    ws = window_size
    st = min(stride, n)

    indices = list(range(n))  # indices[i] = original position of the item now at i
    cur = list(images)        # working copy that is reordered alongside `indices`
    end, start = n, n - ws
    # Stops right after the window that begins at position 0 has been ranked.
    while end > 0 and start + st != 0:
        start = max(start, 0)
        ranked = rerank_window(query, cur[start:end])
        # `ranked` is window-relative; write the new order back into both lists.
        indices[start:end] = [indices[start + p] for p in ranked]
        cur[start:end] = [cur[start + p] for p in ranked]
        end -= st
        start -= st
    return indices


# `pages` is the list of page images rendered in the quick-start snippet.
ranking = rerank("What is the company revenue?", pages)

Tip. For maximum throughput on long documents, add a content-addressed LRU cache around model.model.get_image_features(...) so that overlapping windows reuse ViT embeddings across calls — each page image then needs to be encoded by the vision tower at most once per query.

Using your own images

rerank_window / rerank accept any list of PIL.Image.Image. If you already have page images (e.g. from pdf2image, a screenshot pipeline, or a document layout tool), you can skip pdf_to_images entirely:

python
from PIL import Image

# Load the page images from disk and convert each one to RGB mode.
images = [Image.open(p).convert("RGB") for p in ["page1.png", "page2.png", "page3.png"]]
ranking = rerank("architecture diagram", images)

How it works

1. Prompt construction — a RankGPT-style prompt asks the model to rank the pages (labeled A–T) by relevance to the query.
2. Logits scoring — one forward pass; the logit for each letter token at the last position is the relevance score for that page.
3. Sliding window — for n > window_size, a window slides from the end of the list toward the start, progressively reranking overlapping slices.

Intended use and limitations

Intended use. Reranking of candidate document pages for tasks such as visual document question answering, enterprise document search, and RAG over PDFs. Works either as the second stage of a retrieve-then-rerank pipeline, or as a standalone sliding-window reranker over an arbitrary list of page images.

Out-of-scope. ZipRerank is not a first-stage retriever: running it over every page of a large corpus is expensive and unnecessary — use a cheap retriever first, then rerank the top-K pages with ZipRerank.

Limitations.

- Training focused on English documents; multilingual performance has not been evaluated, so results on non-English content may vary.
- The window size is capped at 20 pages per forward pass (letters A–T); longer documents rely on the sliding-window procedure described above.

Citation

If you find ZipRerank useful, please cite:

bibtex
@article{sun2026very,
  title={Very Efficient Listwise Multimodal Reranking for Long Documents},
  author={Sun, Yiqun and Wei, Pengfei and Hsieh, Lawrence B},
  journal={arXiv preprint arXiv:2605.11864},
  year={2026}
}

Architecture

Qwen3VLForConditionalGeneration

Base model

Qwen/Qwen3-VL-8B-Instruct

Parameters

~8B

Precision

bfloat16

Max window size

20 pages per forward pass

Training data

MMDocIR training set + RankZephyr data

python

import fitz  # PyMuPDF
import torch
from PIL import Image
from transformers import AutoProcessor
from transformers.models.qwen3_vl import Qwen3VLForConditionalGeneration

# Hugging Face Hub repository that holds the ZipRerank weights and processor.
MODEL_ID = "mtri-admin/ZipRerank"

# The processor bundles the chat template, image preprocessing and tokenizer.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    # `dtype` is the current name of this argument; `torch_dtype` is deprecated
    # in the transformers releases this snippet requires (>=4.57).
    dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",  # or "sdpa" if flash-attn is unavailable
    trust_remote_code=True,
).eval()  # inference only: switch off dropout and other train-time behaviour
tokenizer = processor.tokenizer


def pdf_to_images(pdf_path: str, max_size: int = 1024):
    """Render every page of a PDF to an RGB ``PIL.Image``.

    Each page is scaled uniformly by ``max_size / longest_page_edge``, so the
    longest edge of every rendered image targets ``max_size`` pixels. Pages
    smaller than that are scaled up, larger ones are scaled down.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        max_size: Target length, in pixels, of each page's longest edge.

    Returns:
        A list with one image per page, in document order.
    """
    doc = fitz.open(pdf_path)
    try:
        images = []
        for page in doc:
            scale = max_size / max(page.rect.width, page.rect.height)
            pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
            images.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
        return images
    finally:
        # Release the document handle even when rendering a page raises.
        doc.close()


def create_ranking_prompt(query: str, num_passages: int) -> str:
    """Build the RankGPT-style text instruction for one ranking window.

    The prompt tells the model how many page images follow, maps picture
    ``i + 1`` to the identifier ``[A]``, ``[B]``, ... in order, states the
    search query and asks for a ranking such as ``[A] > [B]``.

    Args:
        query: Search query inserted verbatim into the prompt.
        num_passages: Number of page images that accompany the prompt.

    Returns:
        The prompt as one string whose lines are separated by ``"\\n"``.
    """
    # "Picture 1 is passage [A], Picture 2 is passage [B], ..."
    assignments = ", ".join(
        f"Picture {i + 1} is passage [{chr(ord('A') + i)}]"
        for i in range(num_passages)
    )
    lines = [
        "You are RankGPT, an intelligent assistant that can rank passages "
        "based on their relevancy to the query.",
        "",
        f"I will provide you with {num_passages} passages as images.",
        "Rank the passages based on their relevance to the search query.",
        "",
        "The images are provided in order: " + assignments + ".",
        "",
        f"Search Query: {query}",
        "",
        "Rank the passages above based on their relevance to the search query.",
        "The passages should be listed in descending order using identifiers.",
        "The most relevant passages should be listed first.",
        "The output format should be [A] > [B], etc.",
        "Only output the ranking results, do not say anything else.",
    ]
    # The separator must be the escape sequence "\n"; a raw line break inside
    # the quotes is an unterminated string literal.
    return "\n".join(lines)


@torch.no_grad()
def rerank_window(query: str, images):
    """Score one window of page images with a single forward pass.

    The chat prompt is followed by a literal "[" token, and the logits at that
    final position are read for the letter tokens "A", "B", ... (one letter per
    image). Images are then ordered by those logits, highest first.

    Args:
        query: Search query inserted into the ranking prompt.
        images: Between 1 and 20 page images.

    Returns:
        A list of 0-based indices into ``images``, ordered best-first.
    """
    num_pages = len(images)
    assert 1 <= num_pages <= 20, "Window size must be between 1 and 20."

    # A single user turn: the text instruction followed by every page image.
    content = [{"type": "text", "text": create_ranking_prompt(query, num_pages)}]
    content.extend({"type": "image", "image": page} for page in images)
    conversation = [{"role": "user", "content": content}]

    encoded = processor.apply_chat_template(
        [conversation],
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )

    # Append "[" after the generation prompt so the next predicted token is a
    # page letter.
    bracket_id = tokenizer.encode("[", add_special_tokens=False)[0]
    token_ids = encoded["input_ids"][0].tolist() + [bracket_id]
    device = model.device
    input_ids = torch.tensor([token_ids], dtype=torch.long, device=device)

    outputs = model(
        input_ids=input_ids,
        attention_mask=torch.ones_like(input_ids),
        pixel_values=encoded["pixel_values"].to(device),
        image_grid_thw=encoded["image_grid_thw"].to(device),
    )
    # Logits at the last position, i.e. right after the appended "[".
    next_token_logits = outputs.logits[0, -1, :]

    # One score per page: the logit of that page's letter token.
    scores = []
    for offset in range(num_pages):
        letter = chr(ord("A") + offset)
        letter_id = tokenizer.encode(letter, add_special_tokens=False)[0]
        scores.append(next_token_logits[letter_id].item())

    return sorted(range(num_pages), key=scores.__getitem__, reverse=True)


# Render the PDF, then rank its first 20 pages (the most one window accepts).
pages = pdf_to_images("report.pdf", max_size=1024)
ranking = rerank_window("What is the company revenue?", pages[:20])
print("Best-first page indices:", ranking)

python

def rerank(query, images, window_size=20, stride=10):
    """Rerank any number of page images with a back-to-front sliding window.

    Lists of at most ``window_size`` images are ranked by one call to
    ``rerank_window``. Longer lists are processed window by window, starting
    with the last ``window_size`` items and moving ``stride`` positions toward
    the front each step; every window is reordered in place, so items ranked
    highly can keep moving forward through later windows.

    Consecutive windows share ``window_size - stride`` items, so a stride larger
    than the window size leaves items between windows that are never compared.

    Args:
        query: Search query passed to ``rerank_window``.
        images: Page images to rank.
        window_size: Number of images ranked per forward pass.
        stride: How far the window moves toward the front after each pass.

    Returns:
        A list of 0-based indices into ``images``, ordered best-first. An empty
        ``images`` list yields an empty list.

    Raises:
        ValueError: If the sliding procedure is needed and ``stride`` is
            smaller than 1 (the window would never advance).
    """
    n = len(images)
    if n == 0:
        # Nothing to rank; rerank_window rejects empty windows.
        return []
    if n <= window_size:
        return rerank_window(query, images)
    if stride < 1:
        # A zero stride would repeat the same window forever, and a negative
        # one would move the window past the end of the list.
        raise ValueError(f"stride must be at least 1, got {stride}")

    ws = window_size
    st = min(stride, n)

    indices = list(range(n))  # indices[i] = original position of the item now at i
    cur = list(images)        # working copy that is reordered alongside `indices`
    end, start = n, n - ws
    # Stops right after the window that begins at position 0 has been ranked.
    while end > 0 and start + st != 0:
        start = max(start, 0)
        ranked = rerank_window(query, cur[start:end])
        # `ranked` is window-relative; write the new order back into both lists.
        indices[start:end] = [indices[start + p] for p in ranked]
        cur[start:end] = [cur[start + p] for p in ranked]
        end -= st
        start -= st
    return indices


# `pages` is the list of page images rendered in the quick-start snippet.
ranking = rerank("What is the company revenue?", pages)

bibtex

@article{sun2026very,
  title={Very Efficient Listwise Multimodal Reranking for Long Documents},
  author={Sun, Yiqun and Wei, Pengfei and Hsieh, Lawrence B},
  journal={arXiv preprint arXiv:2605.11864},
  year={2026}
}

ZipRerank

README

Model details

Installation

Quick start

Sliding window for long documents

Using your own images

How it works

Intended use and limitations

Citation

Explore FriendliAI today

README

Model details

Installation

Quick start

Sliding window for long documents

Using your own images

How it works

Intended use and limitations

Citation