Write the function you want to evaluate. Here we’ll build an intent classifier:
def classify_intent(datapoint):
    """Classify a customer support message into one intent category.

    Sends the message to the ``gpt-4o-mini`` chat model with a prompt that
    lists four categories (billing, technical, account, general) and asks
    for the category name only.

    Args:
        datapoint: Mapping holding the message under
            ``datapoint["inputs"]["text"]``.

    Returns:
        ``{"intent": reply}`` where ``reply`` is the model's text with
        surrounding whitespace removed and lower-cased. The reply is not
        validated against the category list. It is an empty string when the
        model returns no text content.

    Raises:
        KeyError: If ``datapoint`` lacks ``"inputs"`` or ``"text"``.
    """
    text = datapoint["inputs"]["text"]

    # One category per line so the model sees a readable bullet list.
    prompt = f"""Classify this customer support message into ONE category:
- billing: payment issues, invoices, charges, refunds
- technical: bugs, errors, how to use features
- account: login, password, profile, settings
- general: other questions, feedback

Reply with ONLY the category name.

Message: {text}
Category:"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,  # least-random sampling, for repeatable labels
    )

    # `content` is optional in the SDK response (e.g. on a refusal); fall
    # back to an empty string instead of raising AttributeError on None.
    content = response.choices[0].message.content or ""
    return {"intent": content.strip().lower()}
Define test cases with inputs and expected outputs:
# Evaluation cases: each record pairs an input message with the intent the
# classifier is expected to return for it.
dataset = [
    {"inputs": {"text": message}, "ground_truth": {"intent": label}}
    for message, label in (
        ("I was charged twice for my subscription this month.", "billing"),
        ("The export button isn't working. Getting error code 500.", "technical"),
        ("I forgot my password and the reset email never arrived.", "account"),
        ("Just wanted to say your support team was amazing. Thanks!", "general"),
    )
]
from openai import OpenAI
from honeyhive import evaluate

client = OpenAI()


def classify_intent(datapoint):
    """Classify a customer support message into one intent category.

    Sends the message to the ``gpt-4o-mini`` chat model with a prompt that
    lists four categories (billing, technical, account, general) and asks
    for the category name only.

    Args:
        datapoint: Mapping holding the message under
            ``datapoint["inputs"]["text"]``.

    Returns:
        ``{"intent": reply}`` where ``reply`` is the model's text with
        surrounding whitespace removed and lower-cased. The reply is not
        validated against the category list. It is an empty string when the
        model returns no text content.

    Raises:
        KeyError: If ``datapoint`` lacks ``"inputs"`` or ``"text"``.
    """
    text = datapoint["inputs"]["text"]

    # One category per line so the model sees a readable bullet list.
    prompt = f"""Classify this customer support message into ONE category:
- billing: payment issues, invoices, charges, refunds
- technical: bugs, errors, how to use features
- account: login, password, profile, settings
- general: other questions, feedback

Reply with ONLY the category name.

Message: {text}
Category:"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,  # least-random sampling, for repeatable labels
    )

    # `content` is optional in the SDK response (e.g. on a refusal); fall
    # back to an empty string instead of raising AttributeError on None.
    content = response.choices[0].message.content or ""
    return {"intent": content.strip().lower()}


# Evaluation cases: each record pairs an input message with the intent the
# classifier is expected to return for it.
dataset = [
    {
        "inputs": {"text": "I was charged twice for my subscription this month."},
        "ground_truth": {"intent": "billing"},
    },
    {
        "inputs": {"text": "The export button isn't working. Getting error code 500."},
        "ground_truth": {"intent": "technical"},
    },
    {
        "inputs": {"text": "I forgot my password and the reset email never arrived."},
        "ground_truth": {"intent": "account"},
    },
    {
        "inputs": {"text": "Just wanted to say your support team was amazing. Thanks!"},
        "ground_truth": {"intent": "general"},
    },
]


def intent_match(outputs, inputs, ground_truth):
    """Score one prediction: 1.0 on an intent match, 0.0 otherwise.

    The comparison ignores case and surrounding whitespace. A missing or
    ``None`` mapping or ``"intent"`` value is treated as an empty string
    rather than raising.

    Args:
        outputs: Mapping returned by the evaluated function; its ``"intent"``
            entry is the prediction.
        inputs: Not read here; presumably part of the evaluator calling
            convention -- confirm against the HoneyHive documentation.
        ground_truth: Mapping whose ``"intent"`` entry is the expected label.

    Returns:
        ``1.0`` if the normalized intents are equal, else ``0.0``.
    """
    predicted = ((outputs or {}).get("intent") or "").strip().lower()
    expected = ((ground_truth or {}).get("intent") or "").strip().lower()
    return 1.0 if predicted == expected else 0.0


# Run the classifier over every dataset record and score it with intent_match.
result = evaluate(
    function=classify_intent,
    dataset=dataset,
    evaluators=[intent_match],
    name="intent-classifier-v1",
)