FarmerlineML

dagbani-asr-qwen2audio-lora

Deploy Dedicated

Dedicated Endpoints

Run inference for this model on a single-tenant GPU with unmatched speed and reliability at scale.

Learn more

Get help setting up custom Dedicated Endpoints.

Talk with one of our engineers to get a quote for reserved GPU instances with discounts.

README

License: apache-2.0

Inference

python
import librosa, torch
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
from peft import PeftModel

# Hub identifiers: the base checkpoint and the repository holding the adapter
# (and the processor configuration loaded below).
BASE = "Qwen/Qwen2-Audio-7B-Instruct"
REPO = "FarmerlineML/dagbani-asr-qwen2audio-lora"

# System prompt placed at the start of every transcription conversation.
SYSTEM = (
    "You are a Dagbani speech recognition system. "
    "Transcribe the audio exactly as spoken in Dagbani. "
    "Return only the Dagbani transcript, nothing else."
)

# The processor is loaded from the adapter repository, not from the base model.
processor = AutoProcessor.from_pretrained(REPO)

# Load the base model in bfloat16 on the GPU, attach the adapter from REPO,
# then merge it so that `model` is a plain model without the PEFT wrapper.
model = PeftModel.from_pretrained(
    Qwen2AudioForConditionalGeneration.from_pretrained(
        BASE,
        torch_dtype=torch.bfloat16,
        device_map="cuda",
        attn_implementation="sdpa",
    ),
    REPO,
).merge_and_unload()

# Inference only: switch off training-time behaviour such as dropout.
model.eval()

def transcribe(audio_path: str) -> str:
    """Transcribe one audio file with the module-level ``model`` and ``processor``.

    Args:
        audio_path: Path to an audio file that ``librosa.load`` can read.

    Returns:
        The generated text for the audio, with special tokens removed and
        leading/trailing whitespace stripped.
    """
    # Read the rate once so loading and feature extraction are guaranteed to
    # agree on it.
    sampling_rate = processor.feature_extractor.sampling_rate
    audio, _ = librosa.load(audio_path, sr=sampling_rate)

    # The audio entry is rendered by the chat template; the waveform itself is
    # supplied to the processor call below.
    conversation = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": [
            {"type": "audio", "audio_url": audio_path},
            {"type": "text",  "text": "Transcribe this Dagbani audio exactly."},
        ]},
    ]
    text = processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=False
    )

    # Follow the model's actual device instead of hard-coding "cuda", so the
    # function keeps working if the model is loaded on a different device.
    inputs = processor(
        text=text,
        audio=audio,
        sampling_rate=sampling_rate,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        # Greedy decoding (do_sample=False), capped at 256 new tokens.
        out = model.generate(**inputs, max_new_tokens=256, do_sample=False)

    # generate() returns prompt + continuation; drop the prompt tokens so only
    # the newly generated transcript is decoded.
    n = inputs["input_ids"].shape[1]
    return processor.batch_decode(out[:, n:], skip_special_tokens=True)[0].strip()

# Example: transcribe a local file and print the result.
transcript = transcribe("dagbani_audio.wav")
print(transcript)