Run inference for this model on a single-tenant GPU with unmatched speed and reliability at scale.
Run this model inference with full control and performance in your environment.
Get help setting up custom Dedicated Endpoints.
Talk with our engineers to get a quote for discounted reserved GPU instances.
README
License: apache-2.0
Prerequisites
1. Install dependencies
bash
pip install vllm openai pyyaml
2. Download the prompt template
The extraction prompts are shipped in this repository as python.yaml (for Python) and javascript.yaml (for both JavaScript and TypeScript). Clone the repository or download the prompt file you need before running inference:
bash
# Clone the full repo
git clone https://huggingface.co/LatentForce-ai/Cassini-1
# Or download the prompt file only
wget https://huggingface.co/LatentForce-ai/Cassini-1/resolve/main/python.yaml
3. Start the vLLM server
In a separate terminal, serve the model:
bash
vllm serve LatentForce-ai/Cassini-1 --max-model-len 20480
The server will be available at
http://127.0.0.1:8000 by default. Allow 1–2 minutes for the model to load before sending requests.
Minimal Inference
The following snippet runs inference on a single source file and prints the extracted JSON. Point SOURCE_FILE at any .py file, then run the script.
python
import jsonimport yamlfrom openai import OpenAI# ---------------------------------------------------------------------------# Config# ---------------------------------------------------------------------------VLLM_URL = "http://127.0.0.1:8000/v1"MODEL_ID = "LatentForce-ai/Cassini-1"PROMPT_YAML = "python.yaml" # path to the prompt template from this repoSOURCE_FILE = "main.py" # path to the source file to analyzeMAX_TOKENS = 6144# ---------------------------------------------------------------------------# Load prompt template# ---------------------------------------------------------------------------with open(PROMPT_YAML) as f:config = yaml.safe_load(f)prompt_template = config["prompt"]# ---------------------------------------------------------------------------# Prepare prompt# ---------------------------------------------------------------------------with open(SOURCE_FILE) as f:source_code = f.read()prompt = (prompt_template.replace("FILEPATH_PLACEHOLDER", SOURCE_FILE).replace("CONTENT_PLACEHOLDER", source_code))# ---------------------------------------------------------------------------# Run inference# ---------------------------------------------------------------------------client = OpenAI(base_url=VLLM_URL, api_key="no-key")response = client.chat.completions.create(model=MODEL_ID,messages=[{"role": "user", "content": prompt}],temperature=0,max_tokens=MAX_TOKENS,extra_body={"chat_template_kwargs": {"enable_thinking": False}},)# ---------------------------------------------------------------------------# Parse output# ---------------------------------------------------------------------------raw = response.choices[0].message.contentdef parse_json_response(text: str) -> dict | None:"""Strip markdown fences if present and parse JSON."""text = text.strip()if text.startswith("```"):first_newline = text.find("\n")if first_newline != -1:inner = text[first_newline + 1:]close = inner.rfind("```")if close != -1:inner = inner[:close]text = inner.strip()try:return 
json.loads(text)except json.JSONDecodeError:pass# Fallback: find first complete JSON objectstart, end = text.find("{"), text.rfind("}")if start != -1 and end != -1 and end > start:try:return json.loads(text[start:end + 1])except json.JSONDecodeError:passreturn Noneresult = parse_json_response(raw)if result is None:print("Warning: model output could not be parsed as JSON.")print("Raw output:", raw)else:print(json.dumps(result, indent=2))
Example Output
For example, given the small Python file below:
python
"""Declare and configure the signals for the impress core application"""from functools import partialfrom django.core.cache import cachefrom django.db import transactionfrom django.db.models import signalsfrom django.dispatch import receiverfrom core import modelsfrom core.tasks.search import trigger_batch_document_indexerfrom core.utils.users import get_users_sharing_documents_with_cache_key@receiver(signals.post_save, sender=models.Document)def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-argument"""Asynchronous call to the document indexer at the end of the transaction.Note : Within the transaction we can have an empty content and a serializationerror."""transaction.on_commit(partial(trigger_batch_document_indexer, instance))@receiver(signals.post_save, sender=models.DocumentAccess)def document_access_post_save(sender, instance, created, **kwargs): # pylint: disable=unused-argument"""Asynchronous call to the document indexer at the end of the transaction.Clear cache for the affected user."""if not created:transaction.on_commit(partial(trigger_batch_document_indexer, instance.document))# Invalidate cache for the userif instance.user:cache_key = get_users_sharing_documents_with_cache_key(instance.user)cache.delete(cache_key)@receiver(signals.post_delete, sender=models.DocumentAccess)def document_access_post_delete(sender, instance, **kwargs): # pylint: disable=unused-argument"""Clear cache for the affected user when document access is deleted."""if instance.user:cache_key = get_users_sharing_documents_with_cache_key(instance.user)cache.delete(cache_key)
Cassini-1 produces the following structured JSON:
json
{"imports": [{"module": "core","names": ["models"],"alias": {}},{"module": "core.tasks.search","names": ["trigger_batch_document_indexer"],"alias": {}},{"module": "core.utils.users","names": ["get_users_sharing_documents_with_cache_key"],"alias": {}}],"references": [],"calls": [{"caller": "__module__","callee_text": "receiver","kind": "free","receiver": null,"receiver_type_hint": null,"callee_file_hint": "external","line": 14},{"caller": "document_post_save","callee_text": "on_commit","kind": "method","receiver": "transaction","receiver_type_hint": null,"callee_file_hint": "external","line": 21},{"caller": "document_post_save","callee_text": "partial","kind": "free","receiver": null,"receiver_type_hint": null,"callee_file_hint": "external","line": 21},{"caller": "document_post_save","callee_text": "trigger_batch_document_indexer","kind": "hook","receiver": null,"receiver_type_hint": null,"callee_file_hint": "core/tasks/search.py","line": 21},{"caller": "__module__","callee_text": "receiver","kind": "free","receiver": null,"receiver_type_hint": null,"callee_file_hint": "external","line": 24},{"caller": "document_access_post_save","callee_text": "on_commit","kind": "method","receiver": "transaction","receiver_type_hint": null,"callee_file_hint": "external","line": 31},{"caller": "document_access_post_save","callee_text": "partial","kind": "free","receiver": null,"receiver_type_hint": null,"callee_file_hint": "external","line": 32},{"caller": "document_access_post_save","callee_text": "trigger_batch_document_indexer","kind": "hook","receiver": null,"receiver_type_hint": null,"callee_file_hint": "core/tasks/search.py","line": 32},{"caller": "document_access_post_save","callee_text": "get_users_sharing_documents_with_cache_key","kind": "free","receiver": null,"receiver_type_hint": null,"callee_file_hint": "core/utils/users.py","line": 37},{"caller": "document_access_post_save","callee_text": "delete","kind": "method","receiver": "cache","receiver_type_hint": 
null,"callee_file_hint": "external","line": 38},{"caller": "__module__","callee_text": "receiver","kind": "free","receiver": null,"receiver_type_hint": null,"callee_file_hint": "external","line": 41},{"caller": "document_access_post_delete","callee_text": "get_users_sharing_documents_with_cache_key","kind": "free","receiver": null,"receiver_type_hint": null,"callee_file_hint": "core/utils/users.py","line": 47},{"caller": "document_access_post_delete","callee_text": "delete","kind": "method","receiver": "cache","receiver_type_hint": null,"callee_file_hint": "external","line": 48}],"type_assignments": [{"scope": "document_access_post_save","var": "cache_key","type": null,"type_module": null,"from_call": "get_users_sharing_documents_with_cache_key"},{"scope": "document_access_post_delete","var": "cache_key","type": null,"type_module": null,"from_call": "get_users_sharing_documents_with_cache_key"}],"definitions": ["document_post_save","document_access_post_save","document_access_post_delete"],"definitions_rich": [{"name": "document_post_save","kind": "function","parent": null,"bases": [],"params": [{"name": "sender","type": null},{"name": "instance","type": null}],"returns": null,"line": 15},{"name": "document_access_post_save","kind": "function","parent": null,"bases": [],"params": [{"name": "sender","type": null},{"name": "instance","type": null},{"name": "created","type": null}],"returns": null,"line": 25},{"name": "document_access_post_delete","kind": "function","parent": null,"bases": [],"params": [{"name": "sender","type": null},{"name": "instance","type": null}],"returns": null,"line": 42}]}
Model provider
LatentForce-ai
Model tree
Base
Qwen/Qwen3-4B
Fine-tuned
this model
Modalities
Input
Text
Output
Text
Pricing
Dedicated Endpoints
View details
Supported Functionality
Model APIs
Dedicated Endpoints
Container
More information