import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Checkpoint identifier handed to both ``from_pretrained`` calls below.
model_path = "Neura-Tech-AI/Neuron-Distill-Qwen2.5-3B-Instruct"

print("🎯 Initializing Project Neuron Evaluation Suite...")

# Load the tokenizer, then the causal LM with float16 weights, and finally
# move the model onto the CUDA device.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
)
model = model.to("cuda")
# Prompts fed, in order, to the evaluation run; each one becomes one test case.
eval_prompts = [
    "Tell me about Project Neuron in short. What is its scale?",
    "Explain quantum computing in simple Hindi lyrics.",
    "Write a secure python API routing block for model inference.",
]
def run_performance_test(prompt, max_new_tokens=150):
    """Generate a reply to *prompt* and measure how quickly it was produced.

    The prompt is wrapped in a fixed system/user conversation, rendered with
    the tokenizer's chat template, and decoded greedily by the module-level
    ``model``.

    Args:
        prompt: User message to send to the model.
        max_new_tokens: Upper bound on the number of tokens to generate.
            Defaults to 150.

    Returns:
        A ``(latency, tokens_per_second, response)`` tuple: the seconds spent
        in ``model.generate``, the number of generated tokens divided by that
        latency, and the decoded completion with special tokens skipped and
        surrounding whitespace stripped.
    """
    messages = [
        {"role": "system", "content": "You are Neuron, an advanced AI system developed by Neura Tech AI."},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Put the inputs wherever the model actually lives instead of assuming
    # a hard-coded "cuda" device.
    device = model.device
    inputs = tokenizer([text], return_tensors="pt").to(device)
    input_len = inputs.input_ids.shape[1]

    # CUDA kernels run asynchronously with respect to the host, so drain the
    # queue before and after generation to time only this call's GPU work.
    on_cuda = device.type == "cuda"
    if on_cuda:
        torch.cuda.synchronize(device)
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted.
    start_time = time.perf_counter()
    with torch.no_grad():
        # Greedy decoding (do_sample=False) ignores temperature, so it is not
        # passed; supplying it only triggers a configuration warning.
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )
    if on_cuda:
        torch.cuda.synchronize(device)
    latency = time.perf_counter() - start_time

    # generate() returns prompt + completion; keep only the newly produced ids.
    generated_tokens = outputs[0][input_len:]
    token_count = len(generated_tokens)
    # Guard against a zero-length interval rather than raising ZeroDivisionError.
    tokens_per_second = token_count / latency if latency > 0 else 0.0
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    return latency, tokens_per_second, response
# Run every prompt through the benchmark and print one report per test case:
# the prompt, its latency and throughput, then the model's reply.
print("\n--- Running Quantitative Evaluation Matrix ---")
for i, prompt in enumerate(eval_prompts, start=1):
    lat, tps, resp = run_performance_test(prompt)
    # One print call with a newline separator emits the same three lines
    # as three consecutive prints.
    print(
        f"\n📊 Test Case #{i}: '{prompt}'",
        f"⏱️ Latency: {lat:.2f}s | ⚡ Speed: {tps:.2f} tokens/sec",
        f"🤖 Output:\n{resp}\n" + "-"*40,
        sep="\n",
    )