Dedicated Endpoints

Run inference for this model on a single-tenant GPU with unmatched speed and reliability at scale.

Learn more
Container

Run inference for this model with full control and performance in your own environment.

Learn more

Get help setting up custom Dedicated Endpoints.

Talk with one of our engineers to get a quote for reserved GPU instances with discounts.

README

License: apache-2.0

Prerequisites

1. Install dependencies

bash

pip install vllm openai pyyaml

2. Download the prompt template

The extraction prompts ship in this repository as python.yaml (for Python) and javascript.yaml (for both JavaScript and TypeScript). Clone the repository or download the prompt file you need before running inference:

bash

# Clone the full repo
git clone https://huggingface.co/LatentForce-ai/Cassini-1.0
# Or download the prompt file only
wget https://huggingface.co/LatentForce-ai/Cassini-1.0/resolve/main/prompts/javascript.yaml

3. Start the vLLM server

In a separate terminal, serve the model:

bash

vllm serve LatentForce-ai/Cassini-1.0 --max-model-len 20480

The server will be available at http://127.0.0.1:8000 by default. Allow 1–2 minutes for the model to load before sending requests.


Minimal Inference

The following snippet runs inference on a single source file and prints the extracted JSON. Set PROMPT_YAML to the path of your local copy of python.yaml, point SOURCE_FILE at any .py file, and run the script.

python

import json
import yaml
from openai import OpenAI
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
VLLM_URL = "http://127.0.0.1:8000/v1"  # OpenAI-compatible API of the local vLLM server
MODEL_ID = "LatentForce-ai/Cassini-1.0"
PROMPT_YAML = "python.yaml"  # path to the prompt template from this repo
SOURCE_FILE = "main.py"  # path to the source file to analyze
MAX_TOKENS = 6144  # cap on the number of tokens generated for the reply

# ---------------------------------------------------------------------------
# Load prompt template
# ---------------------------------------------------------------------------
# Read as UTF-8 explicitly: the platform default encoding is not always UTF-8
# and would otherwise mis-decode (or reject) non-ASCII characters.
with open(PROMPT_YAML, encoding="utf-8") as f:
    config = yaml.safe_load(f)
prompt_template = config["prompt"]

# ---------------------------------------------------------------------------
# Prepare prompt
# ---------------------------------------------------------------------------
with open(SOURCE_FILE, encoding="utf-8") as f:
    source_code = f.read()

# Fill the two placeholders of the template with the file path and its content.
prompt = (
    prompt_template
    .replace("FILEPATH_PLACEHOLDER", SOURCE_FILE)
    .replace("CONTENT_PLACEHOLDER", source_code)
)

# ---------------------------------------------------------------------------
# Run inference
# ---------------------------------------------------------------------------
# The key is a placeholder: the server command shown above is started
# without an API key.
client = OpenAI(base_url=VLLM_URL, api_key="no-key")
response = client.chat.completions.create(
    model=MODEL_ID,
    messages=[{"role": "user", "content": prompt}],
    temperature=0,
    max_tokens=MAX_TOKENS,
    # Forwarded to the server's chat template; requests that thinking be off.
    extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)

# ---------------------------------------------------------------------------
# Parse output
# ---------------------------------------------------------------------------
# NOTE: `content` may be None if the server returns no text for the message.
raw = response.choices[0].message.content
def parse_json_response(text: str) -> dict | None:
"""Strip markdown fences if present and parse JSON."""
text = text.strip()
if text.startswith("```"):
first_newline = text.find("\n")
if first_newline != -1:
inner = text[first_newline + 1:]
close = inner.rfind("```")
if close != -1:
inner = inner[:close]
text = inner.strip()
try:
return json.loads(text)
except json.JSONDecodeError:
pass
# Fallback: find first complete JSON object
start, end = text.find("{"), text.rfind("}")
if start != -1 and end != -1 and end > start:
try:
return json.loads(text[start:end + 1])
except json.JSONDecodeError:
pass
return None
result = parse_json_response(raw)
if result is None:
    # Show the unparsed reply so the failure can be inspected.
    print("Warning: model output could not be parsed as JSON.")
    print("Raw output:", raw)
else:
    print(json.dumps(result, indent=2))

Example Output

Consider the small Python file below:

python

"""
Declare and configure the signals for the impress core application
"""
from functools import partial
from django.core.cache import cache
from django.db import transaction
from django.db.models import signals
from django.dispatch import receiver
from core import models
from core.tasks.search import trigger_batch_document_indexer
from core.utils.users import get_users_sharing_documents_with_cache_key
@receiver(signals.post_save, sender=models.Document)
def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-argument
    """
    Asynchronous call to the document indexer at the end of the transaction.
    Note : Within the transaction we can have an empty content and a serialization
    error.
    """
    transaction.on_commit(partial(trigger_batch_document_indexer, instance))
@receiver(signals.post_save, sender=models.DocumentAccess)
def document_access_post_save(sender, instance, created, **kwargs): # pylint: disable=unused-argument
    """
    Asynchronous call to the document indexer at the end of the transaction.
    Clear cache for the affected user.
    """
    if not created:
        transaction.on_commit(
            partial(trigger_batch_document_indexer, instance.document)
        )
    # Invalidate cache for the user
    if instance.user:
        cache_key = get_users_sharing_documents_with_cache_key(instance.user)
        cache.delete(cache_key)
@receiver(signals.post_delete, sender=models.DocumentAccess)
def document_access_post_delete(sender, instance, **kwargs): # pylint: disable=unused-argument
    """
    Clear cache for the affected user when document access is deleted.
    """
    if instance.user:
        cache_key = get_users_sharing_documents_with_cache_key(instance.user)
        cache.delete(cache_key)

For this file, Cassini-1.0 produces the following structured JSON:

json

{
"imports": [
{
"module": "core",
"names": [
"models"
],
"alias": {}
},
{
"module": "core.tasks.search",
"names": [
"trigger_batch_document_indexer"
],
"alias": {}
},
{
"module": "core.utils.users",
"names": [
"get_users_sharing_documents_with_cache_key"
],
"alias": {}
}
],
"references": [],
"calls": [
{
"caller": "__module__",
"callee_text": "receiver",
"kind": "free",
"receiver": null,
"receiver_type_hint": null,
"callee_file_hint": "external",
"line": 14
},
{
"caller": "document_post_save",
"callee_text": "on_commit",
"kind": "method",
"receiver": "transaction",
"receiver_type_hint": null,
"callee_file_hint": "external",
"line": 21
},
{
"caller": "document_post_save",
"callee_text": "partial",
"kind": "free",
"receiver": null,
"receiver_type_hint": null,
"callee_file_hint": "external",
"line": 21
},
{
"caller": "document_post_save",
"callee_text": "trigger_batch_document_indexer",
"kind": "hook",
"receiver": null,
"receiver_type_hint": null,
"callee_file_hint": "core/tasks/search.py",
"line": 21
},
{
"caller": "__module__",
"callee_text": "receiver",
"kind": "free",
"receiver": null,
"receiver_type_hint": null,
"callee_file_hint": "external",
"line": 24
},
{
"caller": "document_access_post_save",
"callee_text": "on_commit",
"kind": "method",
"receiver": "transaction",
"receiver_type_hint": null,
"callee_file_hint": "external",
"line": 31
},
{
"caller": "document_access_post_save",
"callee_text": "partial",
"kind": "free",
"receiver": null,
"receiver_type_hint": null,
"callee_file_hint": "external",
"line": 32
},
{
"caller": "document_access_post_save",
"callee_text": "trigger_batch_document_indexer",
"kind": "hook",
"receiver": null,
"receiver_type_hint": null,
"callee_file_hint": "core/tasks/search.py",
"line": 32
},
{
"caller": "document_access_post_save",
"callee_text": "get_users_sharing_documents_with_cache_key",
"kind": "free",
"receiver": null,
"receiver_type_hint": null,
"callee_file_hint": "core/utils/users.py",
"line": 37
},
{
"caller": "document_access_post_save",
"callee_text": "delete",
"kind": "method",
"receiver": "cache",
"receiver_type_hint": null,
"callee_file_hint": "external",
"line": 38
},
{
"caller": "__module__",
"callee_text": "receiver",
"kind": "free",
"receiver": null,
"receiver_type_hint": null,
"callee_file_hint": "external",
"line": 41
},
{
"caller": "document_access_post_delete",
"callee_text": "get_users_sharing_documents_with_cache_key",
"kind": "free",
"receiver": null,
"receiver_type_hint": null,
"callee_file_hint": "core/utils/users.py",
"line": 47
},
{
"caller": "document_access_post_delete",
"callee_text": "delete",
"kind": "method",
"receiver": "cache",
"receiver_type_hint": null,
"callee_file_hint": "external",
"line": 48
}
],
"type_assignments": [
{
"scope": "document_access_post_save",
"var": "cache_key",
"type": null,
"type_module": null,
"from_call": "get_users_sharing_documents_with_cache_key"
},
{
"scope": "document_access_post_delete",
"var": "cache_key",
"type": null,
"type_module": null,
"from_call": "get_users_sharing_documents_with_cache_key"
}
],
"definitions": [
"document_post_save",
"document_access_post_save",
"document_access_post_delete"
],
"definitions_rich": [
{
"name": "document_post_save",
"kind": "function",
"parent": null,
"bases": [],
"params": [
{
"name": "sender",
"type": null
},
{
"name": "instance",
"type": null
}
],
"returns": null,
"line": 15
},
{
"name": "document_access_post_save",
"kind": "function",
"parent": null,
"bases": [],
"params": [
{
"name": "sender",
"type": null
},
{
"name": "instance",
"type": null
},
{
"name": "created",
"type": null
}
],
"returns": null,
"line": 25
},
{
"name": "document_access_post_delete",
"kind": "function",
"parent": null,
"bases": [],
"params": [
{
"name": "sender",
"type": null
},
{
"name": "instance",
"type": null
}
],
"returns": null,
"line": 42
}
]
}

Model provider

LatentForce-ai

Model tree

Base

Qwen/Qwen3-4B

Fine-tuned

this model

Modalities

Input

Text

Output

Text

Pricing

Dedicated Endpoints

View details

Supported Functionality

Model APIs

Dedicated Endpoints

Container

More information

Explore FriendliAI today