From b4d6082ccd0d52e9a59e5ae4efd3f47521127e44 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Thu, 24 Oct 2024 17:46:14 -0400 Subject: [PATCH 01/36] Add main classes for experiments sdk --- ddtrace/llmobs/experiments.py | 238 ++++++++++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) create mode 100644 ddtrace/llmobs/experiments.py diff --git a/ddtrace/llmobs/experiments.py b/ddtrace/llmobs/experiments.py new file mode 100644 index 00000000000..c51acb39ae5 --- /dev/null +++ b/ddtrace/llmobs/experiments.py @@ -0,0 +1,238 @@ +from ddtrace import config +from typing import List, Dict, Any, Callable, Union +import time +import sys + +class Dataset: + def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") -> None: + self.name = name + self.data = data + self.description = description + self._validate_data() + + def __iter__(self) -> iter: + return iter(self.data) + + def __len__(self) -> int: + return len(self.data) + + def __getitem__(self, index: int) -> Dict[str, Any]: + return self.data[index] + + def __repr__(self) -> str: + header = f"Dataset: {self.name}\nDescription: {self.description}\nLength: {len(self)}\n" + separator = "+" + "-"*10 + "+" + "-"*38 + "+" + "-"*38 + "+" + + def format_dict(d: Dict[str, Any]) -> List[str]: + def truncate(value: str) -> str: + return (value[:17] + '...') if len(value) > 20 else value + + return [f"{key}: {truncate(str(value))}" for key, value in d.items()] + + def format_entries(entries): + formatted_rows = [] + for i, entry in entries: + input_lines = format_dict(entry['input']) + expected_output_lines = format_dict(entry.get('expected_output', {})) + + # Determine the maximum number of lines in input and expected_output + max_lines = max(len(input_lines), len(expected_output_lines)) + + # Pad the lists to have the same number of lines + input_lines += [''] * (max_lines - len(input_lines)) + expected_output_lines += [''] * (max_lines - len(expected_output_lines)) + + for j in range(max_lines): + if j == 0: + index = f"| {i+1:<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} |" + else: + index = f"| {'':<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} |" + formatted_rows.append(index) + formatted_rows.append(separator) + return "\n".join(formatted_rows) + + if len(self.data) <= 4: + entries = format_entries(enumerate(self.data)) + else: + first_two = format_entries(enumerate(self.data[:2])) + last_two = format_entries(enumerate(self.data[-2:], start=len(self.data) - 2)) + entries = f"{first_two}\n| {'...':<8} | {'...':<38} | {'...':<38} |\n{separator}\n{last_two}" + + table = f"{separator}\n| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} |\n{separator}\n{entries}" + return f"{header}\n{table if entries else 'No entries available.'}\n\n" + + def _validate_data(self) -> None: + if not self.data: + raise ValueError("Data cannot be empty.") + + if not all(isinstance(row, dict) for row in self.data): + raise ValueError("All rows must be dictionaries.") + + first_row_keys = set(self.data[0].keys()) + for row in self.data: + if set(row.keys()) != first_row_keys: + raise ValueError("All rows must have the same keys.") + + # Check that 'input' and 'expected_output' are flat dictionaries + for key in ['input', 'expected_output']: + if key in row and any(isinstance(value, dict) for value in row[key].values()): + raise ValueError(f"'{key}' must be a flat dictionary (no nested dictionaries).") + + @classmethod + def from_datadog(cls, name: str) -> 'Dataset': + # TODO: Implement this + pass + + def 
push(self) -> None: + # TODO: Implement this + print(config._dd_api_key) + pass + + +class Experiment: + def __init__(self, name: str, task: Callable, dataset: Dataset, evaluators: List[Callable]) -> None: + self.name = name + self.task = task + self.dataset = dataset + self.evaluators = evaluators + + def __repr__(self) -> str: + separator = "+" + "-"*20 + "+" + "-"*50 + "+" + + def format_evaluator(evaluator: Callable) -> str: + return f"{evaluator.__name__}" + + evaluator_lines = [format_evaluator(evaluator) for evaluator in self.evaluators] + evaluators = ", ".join(evaluator_lines) if evaluator_lines else "No evaluators available" + + table = ( + f"{separator}\n" + f"| {'Experiment':<18} | {self.name:<48} |\n" + f"{separator}\n" + f"| {'Task':<18} | {self.task.__name__:<48} |\n" + f"| {'Dataset':<18} | {f'{self.dataset.name} (n={len(self.dataset)})':<48} |\n" + f"| {'Evaluators':<18} | {evaluators:<48} |\n" + f"{separator}" + ) + return table + + def _validate_tasks(self) -> None: + # TODO: Implement this + pass + + def _validate_evaluators(self) -> None: + # TODO: Implement this + pass + + def run(self) -> 'ExperimentResults': + results = ExperimentResults(self.dataset) + total_rows = len(self.dataset) + + for idx, row in enumerate(self.dataset, 0): + # Apply the task function to the row + start_time = time.time() + output = self.task(row) + end_time = time.time() + duration = end_time - start_time + + # Store the results + results.experiment_rows.append({ + "output": output, + "evaluations": [], + "metadata": { + "duration": duration, + "timestamp": start_time + } + }) + + def _evaluate_row(row: Dict[str, Any], output: Dict[str, Any]) -> Dict[str, Any]: + return {evaluator.__name__: evaluator(row, output) for evaluator in self.evaluators} + + results.experiment_rows[idx]["evaluations"] = _evaluate_row(row, output) + + # Update progress + progress = int(50 * idx / total_rows) # Progress bar length of 50 + bar = '=' * progress + ' ' * (50 - progress) + percent = int(100 * idx / total_rows) + sys.stdout.write(f'\rRunning {self.name}: [{bar}] {percent}% ({idx}/{total_rows})') + sys.stdout.flush() + + # Print a new line after completion + sys.stdout.write('\n') + + return results + + +class ExperimentResults: + def __init__(self, dataset: Dataset) -> None: + self.dataset = dataset + self.experiment_rows = [] + + def __repr__(self) -> str: + separator = "+" + "-"*10 + "+" + "-"*38 + "+" + "-"*38 + "+" + "-"*38 + "+" + "-"*38 + "+" + + def format_dict(d: Union[Dict[str, Any], List[Any]]) -> List[str]: + if isinstance(d, dict): + def truncate(value: str) -> str: + return (value[:17] + '...') if len(value) > 20 else value + + return [f"{key}: {truncate(str(value))}" for key, value in d.items()] + elif isinstance(d, list): + return [str(item) for item in d] + else: + return [str(d)] + + def format_entries(entries): + formatted_rows = [] + for i, entry in enumerate(entries): + dataset_entry = self.dataset[i] + input_lines = format_dict(dataset_entry['input']) + expected_output_lines = format_dict(dataset_entry.get('expected_output', {})) + output_lines = format_dict(entry['output']) + evaluations_lines = format_dict(entry.get('evaluations', [])) + + # Determine the maximum number of lines across all fields + max_lines = max(len(input_lines), len(expected_output_lines), len(output_lines), len(evaluations_lines)) + + # Pad the lists to have the same number of lines + input_lines += [''] * (max_lines - len(input_lines)) + expected_output_lines += [''] * (max_lines - len(expected_output_lines)) + 
output_lines += [''] * (max_lines - len(output_lines)) + evaluations_lines += [''] * (max_lines - len(evaluations_lines)) + + for j in range(max_lines): + if j == 0: + index = f"| {i+1:<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" + else: + index = f"| {'':<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" + formatted_rows.append(index) + formatted_rows.append(separator) + return "\n".join(formatted_rows) + + if len(self.experiment_rows) <= 4: + entries = format_entries(self.experiment_rows) + else: + first_two = format_entries(self.experiment_rows[:2]) + last_two = format_entries(self.experiment_rows[-2:]) + entries = f"{first_two}\n| {'...':<8} | {'...':<38} | {'...':<38} | {'...':<38} | {'...':<38} |\n{separator}\n{last_two}" + + table = ( + f"{separator}\n" + f"| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} | {'Output':<38} | {'Evaluations':<38} |\n" + f"{separator}\n" + f"{entries}" + ) + return f"Experiment Results:\n{table if entries else 'No results available.'}\n\n" + + def __iter__(self) -> iter: + return iter(self.experiment_rows) + + def __len__(self) -> int: + return len(self.experiment_rows) + + def __getitem__(self, index: int) -> Any: + return self.experiment_rows[index] + + def push(self) -> None: + # TODO: Implement this + pass From f9e929629cb9c0e401112f70c1139bd6f1ec6827 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Thu, 24 Oct 2024 18:14:29 -0400 Subject: [PATCH 02/36] Added more things but don't remember what --- ddtrace/llmobs/experiments.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ddtrace/llmobs/experiments.py b/ddtrace/llmobs/experiments.py index c51acb39ae5..798854ed562 100644 --- a/ddtrace/llmobs/experiments.py +++ b/ddtrace/llmobs/experiments.py @@ -117,11 +117,11 @@ def format_evaluator(evaluator: Callable) -> str: return table def _validate_tasks(self) -> None: - # TODO: Implement this + # TODO: Design and implement this pass def _validate_evaluators(self) -> None: - # TODO: Implement this + # TODO: Design and implement this pass def run(self) -> 'ExperimentResults': From 60f3ba5ad488e1357fbf0e5278d29b84514bf2d5 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Mon, 28 Oct 2024 17:28:57 -0400 Subject: [PATCH 03/36] Add network calls for main methods --- ddtrace/llmobs/experiments.py | 456 ++++++++++++++++++++++++++++++++-- 1 file changed, 434 insertions(+), 22 deletions(-) diff --git a/ddtrace/llmobs/experiments.py b/ddtrace/llmobs/experiments.py index 798854ed562..8d43d47415c 100644 --- a/ddtrace/llmobs/experiments.py +++ b/ddtrace/llmobs/experiments.py @@ -1,14 +1,28 @@ -from ddtrace import config -from typing import List, Dict, Any, Callable, Union -import time +from datetime import datetime +from http.client import HTTPSConnection +import hashlib +import json +import os +from typing import Any, Callable, Dict, List, Union import sys +import time +from urllib.parse import quote + +# Constants +BASE_URL = "api.datadoghq.com" +PROJECT_NAME = "sdk-testing" + class Dataset: def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") -> None: self.name = name + self._validate_data(data) self.data = data self.description = description - self._validate_data() + + # Post-push attributes + self.datadog_dataset_id = None + def __iter__(self) -> iter: return iter(self.data) @@ -18,9 +32,10 @@ def __len__(self) -> int: def __getitem__(self, index: int) -> Dict[str, 
Any]: return self.data[index] + def __repr__(self) -> str: - header = f"Dataset: {self.name}\nDescription: {self.description}\nLength: {len(self)}\n" + header = f"Dataset: {self.name}\nDescription: {self.description}\nLength: {len(self)}\nDatadog ID: {self.datadog_dataset_id}\n" separator = "+" + "-"*10 + "+" + "-"*38 + "+" + "-"*38 + "+" def format_dict(d: Dict[str, Any]) -> List[str]: @@ -29,7 +44,7 @@ def truncate(value: str) -> str: return [f"{key}: {truncate(str(value))}" for key, value in d.items()] - def format_entries(entries): + def format_entries(entries): # Fixed indentation - this was nested too deeply formatted_rows = [] for i, entry in entries: input_lines = format_dict(entry['input']) @@ -61,15 +76,15 @@ def format_entries(entries): table = f"{separator}\n| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} |\n{separator}\n{entries}" return f"{header}\n{table if entries else 'No entries available.'}\n\n" - def _validate_data(self) -> None: - if not self.data: + def _validate_data(self, data: List[Dict[str, Any]]) -> None: + if not data: raise ValueError("Data cannot be empty.") - if not all(isinstance(row, dict) for row in self.data): + if not all(isinstance(row, dict) for row in data): raise ValueError("All rows must be dictionaries.") - first_row_keys = set(self.data[0].keys()) - for row in self.data: + first_row_keys = set(data[0].keys()) + for row in data: if set(row.keys()) != first_row_keys: raise ValueError("All rows must have the same keys.") @@ -80,13 +95,156 @@ def _validate_data(self) -> None: @classmethod def from_datadog(cls, name: str) -> 'Dataset': - # TODO: Implement this - pass + """Create a dataset from a dataset hosted in Datadog. + + Args: + name: Name of the dataset to retrieve from Datadog + + Returns: + Dataset: A new Dataset instance populated with the records from Datadog + + Raises: + ValueError: If the dataset is not found + Exception: If there are HTTP errors during the request + """ + conn = HTTPSConnection(BASE_URL) + headers = { + "DD-API-KEY": os.getenv("DD_API_KEY"), + "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), + "Content-Type": "application/json" + } + + try: + # Get dataset ID + encoded_name = quote(name) + url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" + response_data = _make_request(conn, headers, "GET", url, context="Dataset lookup") + datasets = response_data.get('data', []) + + if not datasets: + raise ValueError(f"Dataset '{name}' not found") + + dataset_id = datasets[0]['id'] + + # Get dataset records + url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" + records_data = _make_request(conn, headers, "GET", url, context="Records lookup") + + # Transform records into the expected format + class_records = [] + for record in records_data.get('data', []): + attrs = record.get('attributes', {}) + class_records.append({ + "input": attrs.get('input', {}), + "expected_output": attrs.get('expected_output', {}), + **attrs.get('metadata', {}) + }) + + # Create new dataset instance + dataset = cls(name, class_records) + dataset.datadog_dataset_id = dataset_id + return dataset + + finally: + conn.close() + + def push(self) -> Dict[str, str]: + """Push the dataset to Datadog. 
+ + Returns: + Dict[str, str]: Dictionary containing dataset information including: + - dataset_id: The ID of the created/updated dataset + - dataset_name: The name of the dataset + - record_count: Number of records uploaded + """ + # Initialize connection and headers + conn = HTTPSConnection(BASE_URL) + headers = { + "DD-API-KEY": os.getenv("DD_API_KEY"), + "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), + "Content-Type": "application/json" + } + + try: + # Check if dataset exists + encoded_name = quote(self.name) + url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" + response_data = _make_request(conn, headers, "GET", url, context="Dataset lookup") + datasets = response_data.get('data', []) + + if not datasets: + # Create new dataset + print(f"Dataset '{self.name}' not found. Creating it.") + dataset_payload = { + "data": { + "type": "datasets", + "attributes": { + "name": self.name, + "description": self.description or f"Dataset used for {self.name}", + "metadata": {"team": "ml-obs"} + } + } + } + response_data = _make_request( + conn, + headers, + "POST", + "/api/unstable/llm-obs/v1/datasets", + body=json.dumps(dataset_payload), + context="Dataset creation" + ) + dataset_id = response_data['data']['id'] + self.datadog_dataset_id = dataset_id + else: + # Dataset exists, create a new version + dataset_id = datasets[0]['id'] + version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + new_dataset_name = f"{self.name}-{version_suffix}" + print(f"Dataset '{self.name}' found. Creating new version '{new_dataset_name}'.") + dataset_payload = { + "data": { + "type": "datasets", + "attributes": { + "name": new_dataset_name, + "description": f"Dataset versioned on {version_suffix} used for {self.name}", + "metadata": {"team": "ml-obs"} + } + } + } + response_data = _make_request( + conn, + headers, + "POST", + "/api/unstable/llm-obs/v1/datasets", + body=json.dumps(dataset_payload), + context="Dataset version creation" + ) + dataset_id = response_data['data']['id'] + self.datadog_dataset_id = dataset_id + self.name = new_dataset_name + + # Add records to the dataset + records_payload = { + "data": { + "type": "datasets", + "attributes": { + "records": self.data + } + } + } + url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" + _make_request(conn, headers, "POST", url, body=json.dumps(records_payload), context="Adding records") + + print(f"✓ Successfully uploaded dataset '{self.name}'") + print(f" • Dataset ID: {dataset_id}") + print(f" • Records uploaded: {len(self.data)}") + + return self + + finally: + conn.close() + - def push(self) -> None: - # TODO: Implement this - print(config._dd_api_key) - pass class Experiment: @@ -95,6 +253,12 @@ def __init__(self, name: str, task: Callable, dataset: Dataset, evaluators: List self.task = task self.dataset = dataset self.evaluators = evaluators + self.tags = [] + + # Post-run attributes + self.has_run = False + self.results = None + def __repr__(self) -> str: separator = "+" + "-"*20 + "+" + "-"*50 + "+" @@ -139,10 +303,17 @@ def run(self) -> 'ExperimentResults': results.experiment_rows.append({ "output": output, "evaluations": [], + "metadata": { + "timestamp": start_time, "duration": duration, - "timestamp": start_time - } + "dataset_record_idx": idx, + "project_name": PROJECT_NAME, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "tags": self.tags, + "error": None }) def _evaluate_row(row: Dict[str, Any], output: Dict[str, Any]) -> Dict[str, Any]: @@ -160,7 +331,217 @@ def 
_evaluate_row(row: Dict[str, Any], output: Dict[str, Any]) -> Dict[str, Any] # Print a new line after completion sys.stdout.write('\n') + self.has_run = True + self.results = results return results + + def get_results(self) -> 'ExperimentResults': + if not self.has_run: + raise ValueError("Experiment has not been run yet") + return self.results + + def push(self) -> Dict[str, str]: + """Push the experiment results to Datadog. + + Returns: + Dict[str, str]: Dictionary containing experiment information including: + - experiment_id: The ID of the created experiment + - experiment_name: The name of the experiment + - span_count: Number of spans uploaded + """ + if not self.has_run: + raise ValueError("Experiment has not been run yet") + + # Initialize connection and headers + conn = HTTPSConnection(BASE_URL) + headers = { + "DD-API-KEY": os.getenv("DD_API_KEY"), + "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), + "Content-Type": "application/json" + } + + try: + # Check if project exists + url = f"/api/unstable/llm-obs/v1/projects?filter[name]={PROJECT_NAME}" + response_data = _make_request(conn, headers, "GET", url, context="Project lookup") + projects = response_data.get('data', []) + + if not projects: + # Create new project + print(f"Project '{PROJECT_NAME}' not found. Creating it.") + project_payload = { + "data": { + "type": "projects", + "attributes": { + "name": PROJECT_NAME, + "description": f"Project for {PROJECT_NAME}", + "metadata": {"team": "ml-obs"} + } + } + } + response_data = _make_request( + conn, + headers, + "POST", + "/api/unstable/llm-obs/v1/projects", + body=json.dumps(project_payload), + context="Project creation" + ) + project_id = response_data['data']['id'] + else: + project_id = projects[0]['id'] + + # Check if experiment exists + encoded_name = quote(self.name) + url = f"/api/unstable/llm-obs/v1/experiments?filter[name]={encoded_name}" + response_data = _make_request(conn, headers, "GET", url, context="Experiment lookup") + experiments = response_data.get('data', []) + + if not experiments: + # Create new experiment + print(f"Experiment '{self.name}' not found. Creating it.") + experiment_payload = { + "data": { + "type": "experiments", + "attributes": { + "name": self.name, + "description": f"Experiment: {self.name} on dataset: {self.dataset.name}", + "dataset_id": self.dataset.datadog_dataset_id, + "project_id": project_id, + "metadata": { + "tags": self.tags, + "team": "ml-obs" + } + } + } + } + response_data = _make_request( + conn, + headers, + "POST", + "/api/unstable/llm-obs/v1/experiments", + body=json.dumps(experiment_payload), + context="Experiment creation" + ) + experiment_id = response_data['data']['id'] + else: + # Experiment exists, create a new version + version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + new_experiment_name = f"{self.name}-{version_suffix}" + print(f"Experiment '{self.name}' found. 
Creating new version '{new_experiment_name}'.") + experiment_payload = { + "data": { + "type": "experiments", + "attributes": { + "name": new_experiment_name, + "description": f"Experiment versioned on {version_suffix} used for {self.name}", + "dataset_id": self.dataset.datadog_dataset_id, + "project_id": project_id, + "metadata": { + "tags": self.tags, + "team": "ml-obs" + } + } + } + } + response_data = _make_request( + conn, + headers, + "POST", + "/api/unstable/llm-obs/v1/experiments", + body=json.dumps(experiment_payload), + context="Experiment version creation" + ) + experiment_id = response_data['data']['id'] + self.name = new_experiment_name + + # Prepare and send experiment results + spans = [] + metrics = [] + + + + for idx, result in enumerate(self.results): + + span = { + "span_id": _make_id(), + "project_id": project_id, + "experiment_id": experiment_id, + "dataset_id": self.dataset.datadog_dataset_id, + "dataset_record_id": _make_id(), + "start_ns": int(result['metadata']['timestamp'] * 1e9), + "duration": float(result['metadata']['duration'] * 1e9), + "tags": self.tags, + "status": "ok", + "meta": { + "span": {"kind": "experiment"}, + "input": self.dataset[idx]['input'], + "output": result['output'], + "expected_output": self.dataset[idx].get('expected_output', {}), + "error": { + "message": result['error'], + "stack": None, + "type": None + } + } + + } + spans.append(span) + + # Add evaluation metrics + for metric_name, metric_value in result['evaluations'].items(): + timestamp_ms = int(result['metadata']['timestamp'] * 1000) + + if isinstance(metric_value, bool): + metric_value = 1 if metric_value else 0 + metric_type = "score" + elif isinstance(metric_value, (int, float)): + metric_type = "score" + else: + metric_type = "categorical" + metric_value = str(metric_value) + + metric = { + "span_id": span['span_id'], + "metric_type": metric_type, + "timestamp_ms": timestamp_ms, + "label": metric_name, + "score_value" if metric_type == "score" else "categorical_value": metric_value + } + metrics.append(metric) + + results_payload = { + "data": { + "type": "experiments", + "attributes": { + "spans": spans, + "metrics": metrics + } + } + } + + + url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" + _make_request( + conn, + headers, + "POST", + url, + body=json.dumps(results_payload), + context="Publishing results" + ) + + print(f"✓ Successfully uploaded experiment '{self.name}'") + print(f" • Experiment ID: {experiment_id}") + print(f" • Spans uploaded: {len(spans)}") + print(f" • Metrics uploaded: {len(metrics)}") + + return self + + finally: + conn.close() + + class ExperimentResults: @@ -233,6 +614,37 @@ def __len__(self) -> int: def __getitem__(self, index: int) -> Any: return self.experiment_rows[index] - def push(self) -> None: - # TODO: Implement this - pass + + +def _make_request(conn: HTTPSConnection, headers: Dict[str, Any], method: str, url: str, body: Any = None, context: str = "") -> Dict[str, Any]: + if method == "GET": + conn.request(method, url, headers=headers) + else: + if body is not None and isinstance(body, str): + body = body.encode('utf-8') + conn.request(method, url, body=body, headers=headers) + + response = conn.getresponse() + response_body = response.read() + + if response.status >= 400: + error_message = f"HTTP {response.status} Error during {context}: {response.reason}\nResponse body: {response_body.decode('utf-8')}" + raise Exception(error_message) + + # Add handling for empty response + if not response_body: + return {} # Return 
empty dict for empty responses + + try: + return json.loads(response_body) + except json.JSONDecodeError: + error_message = f"Invalid JSON response during {context}. Status: {response.status}\nResponse body: {response_body.decode('utf-8')}" + raise Exception(error_message) + +def _make_id() -> str: + return hashlib.sha256(datetime.now().isoformat().encode('utf-8')).hexdigest() + + + + + From d48942d0c554c6a487a56fa851dc95ff01c3ad08 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 29 Oct 2024 10:41:33 -0400 Subject: [PATCH 04/36] Add docstring --- ddtrace/llmobs/experiments.py | 513 ++++++++++++++++++++++++---------- 1 file changed, 364 insertions(+), 149 deletions(-) diff --git a/ddtrace/llmobs/experiments.py b/ddtrace/llmobs/experiments.py index 8d43d47415c..5781b1e9572 100644 --- a/ddtrace/llmobs/experiments.py +++ b/ddtrace/llmobs/experiments.py @@ -3,17 +3,49 @@ import hashlib import json import os -from typing import Any, Callable, Dict, List, Union +from typing import Any, Callable, Dict, List, Union, Optional, Iterator import sys import time from urllib.parse import quote +import concurrent.futures +import itertools +import uuid # Constants BASE_URL = "api.datadoghq.com" -PROJECT_NAME = "sdk-testing" + + +def _validate_api_keys() -> None: + """Validate that required Datadog API keys are set in environment variables. + + Raises: + ValueError: If any required API keys are missing from environment variables + """ + missing_keys = [] + for key in ["DD_API_KEY", "DD_APPLICATION_KEY"]: + if not os.getenv(key): + missing_keys.append(key) + + if missing_keys: + raise ValueError( + f"Missing required Datadog API keys in environment variables: {', '.join(missing_keys)}. " + "Please set these environment variables before pushing to Datadog." + ) class Dataset: + """A container for LLM experiment data that can be pushed to and retrieved from Datadog. + + This class manages collections of input/output pairs used for LLM experiments, + with functionality to validate, push to Datadog, and retrieve from Datadog. + + Attributes: + name (str): Name of the dataset + data (List[Dict[str, Any]]): List of records containing input/output pairs + description (str): Optional description of the dataset + datadog_dataset_id (str): ID assigned by Datadog after pushing (None if not pushed) + """ + def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") -> None: self.name = name self._validate_data(data) @@ -24,7 +56,7 @@ def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") self.datadog_dataset_id = None - def __iter__(self) -> iter: + def __iter__(self) -> Iterator[Dict[str, Any]]: return iter(self.data) def __len__(self) -> int: @@ -77,9 +109,22 @@ def format_entries(entries): # Fixed indentation - this was nested too deeply return f"{header}\n{table if entries else 'No entries available.'}\n\n" def _validate_data(self, data: List[Dict[str, Any]]) -> None: + """Validate the format and structure of dataset records. 
+ + Args: + data: List of dataset records to validate + + Raises: + ValueError: If data is empty, contains non-dictionary rows, + has inconsistent keys, contains nested dictionaries, + or exceeds 50,000 rows + """ if not data: raise ValueError("Data cannot be empty.") + if len(data) > 50000: + raise ValueError("Dataset cannot exceed 50,000 rows.") + if not all(isinstance(row, dict) for row in data): raise ValueError("All rows must be dictionaries.") @@ -107,6 +152,7 @@ def from_datadog(cls, name: str) -> 'Dataset': ValueError: If the dataset is not found Exception: If there are HTTP errors during the request """ + _validate_api_keys() conn = HTTPSConnection(BASE_URL) headers = { "DD-API-KEY": os.getenv("DD_API_KEY"), @@ -157,7 +203,7 @@ def push(self) -> Dict[str, str]: - dataset_name: The name of the dataset - record_count: Number of records uploaded """ - # Initialize connection and headers + _validate_api_keys() conn = HTTPSConnection(BASE_URL) headers = { "DD-API-KEY": os.getenv("DD_API_KEY"), @@ -248,13 +294,29 @@ def push(self) -> Dict[str, str]: class Experiment: - def __init__(self, name: str, task: Callable, dataset: Dataset, evaluators: List[Callable]) -> None: + """Manages the execution and evaluation of LLM tasks on a dataset. + + This class handles running tasks against datasets, applying evaluators, + and collecting results for analysis. + + Attributes: + name (str): Name of the experiment + task (Callable): Function that processes each dataset record + dataset (Dataset): Dataset to run the experiment on + evaluators (List[Callable]): Functions that evaluate task outputs + tags (List[str]): Tags for organizing experiments + project_name (str): Name of the project this experiment belongs to + has_run (bool): Whether the experiment has been executed + results (ExperimentResults): Results after running the experiment + """ + + def __init__(self, name: str, task: Callable, dataset: Dataset, evaluators: List[Callable], tags: List[str] = [], project_name: str = "-") -> None: self.name = name self.task = task self.dataset = dataset self.evaluators = evaluators self.tags = [] - + self.project_name = project_name # Post-run attributes self.has_run = False self.results = None @@ -288,45 +350,112 @@ def _validate_evaluators(self) -> None: # TODO: Design and implement this pass - def run(self) -> 'ExperimentResults': - results = ExperimentResults(self.dataset) + def _validate_tags(self) -> None: + """Validate experiment tags format. + + Raises: + ValueError: If any tag doesn't follow the 'key:value' format + """ + for tag in self.tags: + if not isinstance(tag, str) or ":" not in tag: + raise ValueError(f"Invalid tag format: {tag}. Tags should be in the format 'key:value'.") + + def run(self, _jobs: int = 10) -> 'ExperimentResults': + """Execute the experiment on the dataset. + + Runs the task function on each dataset record in parallel and collects + results and evaluations. + + Args: + _jobs (int, optional): Number of parallel workers. Defaults to 10. + Must be between 1 and 20. 
+ + Returns: + ExperimentResults: Object containing the experiment results + + Raises: + ValueError: If _jobs is not between 1 and 20 + """ + if not 1 <= _jobs <= 20: + raise ValueError("Number of jobs must be between 1 and 20") + + results = ExperimentResults(self.dataset, self) total_rows = len(self.dataset) - for idx, row in enumerate(self.dataset, 0): - # Apply the task function to the row - start_time = time.time() - output = self.task(row) - end_time = time.time() - duration = end_time - start_time - - # Store the results - results.experiment_rows.append({ - "output": output, - "evaluations": [], - - "metadata": { - "timestamp": start_time, - "duration": duration, - "dataset_record_idx": idx, - "project_name": PROJECT_NAME, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "tags": self.tags, - "error": None - }) - - def _evaluate_row(row: Dict[str, Any], output: Dict[str, Any]) -> Dict[str, Any]: - return {evaluator.__name__: evaluator(row, output) for evaluator in self.evaluators} - - results.experiment_rows[idx]["evaluations"] = _evaluate_row(row, output) - - # Update progress - progress = int(50 * idx / total_rows) # Progress bar length of 50 - bar = '=' * progress + ' ' * (50 - progress) - percent = int(100 * idx / total_rows) - sys.stdout.write(f'\rRunning {self.name}: [{bar}] {percent}% ({idx}/{total_rows})') - sys.stdout.flush() + def process_row(idx_row): + idx, row = idx_row + try: + # Apply the task function to the row + start_time = time.time() + output = self.task(row) + end_time = time.time() + duration = end_time - start_time + + # Evaluate the output + evaluations = {evaluator.__name__: evaluator(row, output) for evaluator in self.evaluators} + + # Store the result + return { + "idx": idx, + "result": { + "output": output, + "evaluations": evaluations, + "metadata": { + "timestamp": start_time, + "duration": duration, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "tags": self.tags, + "error": None + } + } + except Exception as e: + # Handle exceptions and store the error + return { + "idx": idx, + "result": { + "output": None, + "evaluations": {}, + "metadata": { + "timestamp": time.time(), + "duration": 0, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "tags": self.tags, + "error": str(e) + } + } + + with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: + # Create futures list first + future_to_idx = { + executor.submit(process_row, (idx, row)): idx + for idx, row in enumerate(self.dataset) + } + + # Process as they complete while maintaining order + completed = 0 + results_buffer = [None] * total_rows + for future in concurrent.futures.as_completed(future_to_idx): + idx = future_to_idx[future] + results_buffer[idx] = future.result()['result'] + completed += 1 + + # Update progress + progress = int(50 * completed / total_rows) + bar = '=' * progress + ' ' * (50 - progress) + percent = int(100 * completed / total_rows) + sys.stdout.write(f'\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})') + sys.stdout.flush() + + # Add results in correct order + results.experiment_rows = results_buffer # Print a new line after completion sys.stdout.write('\n') @@ -334,12 +463,98 @@ def _evaluate_row(row: Dict[str, Any], output: Dict[str, Any]) -> Dict[str, Any] self.has_run = True self.results = results return results - - def 
get_results(self) -> 'ExperimentResults': + + def get_results(self) -> Union['ExperimentResults', List['ExperimentResults']]: if not self.has_run: raise ValueError("Experiment has not been run yet") return self.results + + + +class ExperimentResults: + """Contains and manages the results of an experiment run. + + Stores the outputs, evaluations, and metadata for each record processed + in an experiment, with functionality to analyze and push results to Datadog. + + Attributes: + dataset (Dataset): The dataset used in the experiment + experiment (Experiment): The experiment that generated these results + experiment_rows (List[Dict]): Results for each processed record + """ + + def __init__(self, dataset: Dataset, experiment: Experiment) -> None: + self.dataset = dataset + self.experiment = experiment + self.experiment_rows = [] + + def __repr__(self) -> str: + separator = "+" + "-"*10 + "+" + "-"*38 + "+" + "-"*38 + "+" + "-"*38 + "+" + "-"*38 + "+" + + def format_dict(d: Union[Dict[str, Any], List[Any]]) -> List[str]: + if isinstance(d, dict): + def truncate(value: str) -> str: + return (value[:17] + '...') if len(value) > 20 else value + + return [f"{key}: {truncate(str(value))}" for key, value in d.items()] + elif isinstance(d, list): + return [str(item) for item in d] + else: + return [str(d)] + + def format_entries(entries): + formatted_rows = [] + for i, entry in enumerate(entries): + dataset_idx = entry['metadata']['dataset_record_idx'] + dataset_entry = self.dataset[dataset_idx] + input_lines = format_dict(dataset_entry['input']) + expected_output_lines = format_dict(dataset_entry.get('expected_output', {})) + output_lines = format_dict(entry['output']) + evaluations_lines = format_dict(entry.get('evaluations', [])) + + # Determine the maximum number of lines across all fields + max_lines = max(len(input_lines), len(expected_output_lines), len(output_lines), len(evaluations_lines)) + + # Pad the lists to have the same number of lines + input_lines += [''] * (max_lines - len(input_lines)) + expected_output_lines += [''] * (max_lines - len(expected_output_lines)) + output_lines += [''] * (max_lines - len(output_lines)) + evaluations_lines += [''] * (max_lines - len(evaluations_lines)) + + for j in range(max_lines): + if j == 0: + index = f"| {dataset_idx:<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" + else: + index = f"|{'':<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" + formatted_rows.append(index) + formatted_rows.append(separator) + return "\n".join(formatted_rows) + + if len(self.experiment_rows) <= 4: + entries = format_entries(self.experiment_rows) + else: + first_two = format_entries(self.experiment_rows[:2]) + last_two = format_entries(self.experiment_rows[-2:]) + entries = f"{first_two}\n| {'...':<8} | {'...':<38} | {'...':<38} | {'...':<38} | {'...':<38} |\n{separator}\n{last_two}" + + table = ( + f"{separator}\n" + f"| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} | {'Output':<38} | {'Evaluations':<38} |\n" + f"{separator}\n" + f"{entries}" + ) + return f"Experiment Results:\n{table if entries else 'No results available.'}\n\n" + + def __iter__(self) -> Iterator[Dict[str, Any]]: + return iter(self.experiment_rows) + + def __len__(self) -> int: + return len(self.experiment_rows) + + def __getitem__(self, index: int) -> Any: + return self.experiment_rows[index] + def push(self) -> Dict[str, str]: """Push the experiment results to 
Datadog. @@ -349,9 +564,8 @@ def push(self) -> Dict[str, str]: - experiment_name: The name of the experiment - span_count: Number of spans uploaded """ - if not self.has_run: - raise ValueError("Experiment has not been run yet") - + _validate_api_keys() + # Initialize connection and headers conn = HTTPSConnection(BASE_URL) headers = { @@ -362,19 +576,19 @@ def push(self) -> Dict[str, str]: try: # Check if project exists - url = f"/api/unstable/llm-obs/v1/projects?filter[name]={PROJECT_NAME}" + url = f"/api/unstable/llm-obs/v1/projects?filter[name]={self.experiment.project_name}" response_data = _make_request(conn, headers, "GET", url, context="Project lookup") projects = response_data.get('data', []) if not projects: # Create new project - print(f"Project '{PROJECT_NAME}' not found. Creating it.") + print(f"Project '{self.experiment.project_name}' not found. Creating it.") project_payload = { "data": { "type": "projects", "attributes": { - "name": PROJECT_NAME, - "description": f"Project for {PROJECT_NAME}", + "name": self.experiment.project_name, + "description": f"Project for {self.experiment.project_name}", "metadata": {"team": "ml-obs"} } } @@ -392,24 +606,24 @@ def push(self) -> Dict[str, str]: project_id = projects[0]['id'] # Check if experiment exists - encoded_name = quote(self.name) + encoded_name = quote(self.experiment.name) url = f"/api/unstable/llm-obs/v1/experiments?filter[name]={encoded_name}" response_data = _make_request(conn, headers, "GET", url, context="Experiment lookup") experiments = response_data.get('data', []) if not experiments: # Create new experiment - print(f"Experiment '{self.name}' not found. Creating it.") + print(f"Experiment '{self.experiment.name}' not found. Creating it.") experiment_payload = { "data": { "type": "experiments", "attributes": { - "name": self.name, - "description": f"Experiment: {self.name} on dataset: {self.dataset.name}", - "dataset_id": self.dataset.datadog_dataset_id, + "name": self.experiment.name, + "description": f"Experiment: {self.experiment.name} on dataset: {self.experiment.dataset.name}", + "dataset_id": self.experiment.dataset.datadog_dataset_id, "project_id": project_id, "metadata": { - "tags": self.tags, + "tags": self.experiment.tags, "team": "ml-obs" } } @@ -427,18 +641,18 @@ def push(self) -> Dict[str, str]: else: # Experiment exists, create a new version version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - new_experiment_name = f"{self.name}-{version_suffix}" - print(f"Experiment '{self.name}' found. Creating new version '{new_experiment_name}'.") + new_experiment_name = f"{self.experiment.name}-{version_suffix}" + print(f"Experiment '{self.experiment.name}' found. 
Creating new version '{new_experiment_name}'.") experiment_payload = { "data": { "type": "experiments", "attributes": { "name": new_experiment_name, - "description": f"Experiment versioned on {version_suffix} used for {self.name}", - "dataset_id": self.dataset.datadog_dataset_id, + "description": f"Experiment versioned on {version_suffix} used for {self.experiment.name}", + "dataset_id": self.experiment.dataset.datadog_dataset_id, "project_id": project_id, "metadata": { - "tags": self.tags, + "tags": self.experiment.tags, "team": "ml-obs" } } @@ -453,7 +667,7 @@ def push(self) -> Dict[str, str]: context="Experiment version creation" ) experiment_id = response_data['data']['id'] - self.name = new_experiment_name + self.experiment.name = new_experiment_name # Prepare and send experiment results spans = [] @@ -461,27 +675,27 @@ def push(self) -> Dict[str, str]: - for idx, result in enumerate(self.results): + for idx, result in enumerate(self.experiment_rows): span = { "span_id": _make_id(), "project_id": project_id, "experiment_id": experiment_id, - "dataset_id": self.dataset.datadog_dataset_id, + "dataset_id": self.experiment.dataset.datadog_dataset_id, "dataset_record_id": _make_id(), "start_ns": int(result['metadata']['timestamp'] * 1e9), "duration": float(result['metadata']['duration'] * 1e9), - "tags": self.tags, + "tags": self.experiment.tags, "status": "ok", "meta": { "span": {"kind": "experiment"}, - "input": self.dataset[idx]['input'], + "input": self.experiment.dataset[idx]['input'], "output": result['output'], - "expected_output": self.dataset[idx].get('expected_output', {}), + "expected_output": self.experiment.dataset[idx].get('expected_output', {}), "error": { - "message": result['error'], - "stack": None, - "type": None + "message": result['error'], + "stack": None, + "type": None } } @@ -531,7 +745,7 @@ def push(self) -> Dict[str, str]: context="Publishing results" ) - print(f"✓ Successfully uploaded experiment '{self.name}'") + print(f"✓ Successfully uploaded experiment results for '{self.experiment.name}'") print(f" • Experiment ID: {experiment_id}") print(f" • Spans uploaded: {len(spans)}") print(f" • Metrics uploaded: {len(metrics)}") @@ -542,81 +756,33 @@ def push(self) -> Dict[str, str]: conn.close() - - -class ExperimentResults: - def __init__(self, dataset: Dataset) -> None: - self.dataset = dataset - self.experiment_rows = [] - - def __repr__(self) -> str: - separator = "+" + "-"*10 + "+" + "-"*38 + "+" + "-"*38 + "+" + "-"*38 + "+" + "-"*38 + "+" - - def format_dict(d: Union[Dict[str, Any], List[Any]]) -> List[str]: - if isinstance(d, dict): - def truncate(value: str) -> str: - return (value[:17] + '...') if len(value) > 20 else value - - return [f"{key}: {truncate(str(value))}" for key, value in d.items()] - elif isinstance(d, list): - return [str(item) for item in d] - else: - return [str(d)] - - def format_entries(entries): - formatted_rows = [] - for i, entry in enumerate(entries): - dataset_entry = self.dataset[i] - input_lines = format_dict(dataset_entry['input']) - expected_output_lines = format_dict(dataset_entry.get('expected_output', {})) - output_lines = format_dict(entry['output']) - evaluations_lines = format_dict(entry.get('evaluations', [])) - - # Determine the maximum number of lines across all fields - max_lines = max(len(input_lines), len(expected_output_lines), len(output_lines), len(evaluations_lines)) - - # Pad the lists to have the same number of lines - input_lines += [''] * (max_lines - len(input_lines)) - expected_output_lines += [''] * 
(max_lines - len(expected_output_lines)) - output_lines += [''] * (max_lines - len(output_lines)) - evaluations_lines += [''] * (max_lines - len(evaluations_lines)) - - for j in range(max_lines): - if j == 0: - index = f"| {i+1:<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" - else: - index = f"| {'':<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" - formatted_rows.append(index) - formatted_rows.append(separator) - return "\n".join(formatted_rows) - - if len(self.experiment_rows) <= 4: - entries = format_entries(self.experiment_rows) - else: - first_two = format_entries(self.experiment_rows[:2]) - last_two = format_entries(self.experiment_rows[-2:]) - entries = f"{first_two}\n| {'...':<8} | {'...':<38} | {'...':<38} | {'...':<38} | {'...':<38} |\n{separator}\n{last_two}" - - table = ( - f"{separator}\n" - f"| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} | {'Output':<38} | {'Evaluations':<38} |\n" - f"{separator}\n" - f"{entries}" - ) - return f"Experiment Results:\n{table if entries else 'No results available.'}\n\n" - - def __iter__(self) -> iter: - return iter(self.experiment_rows) - - def __len__(self) -> int: - return len(self.experiment_rows) - - def __getitem__(self, index: int) -> Any: - return self.experiment_rows[index] - - - -def _make_request(conn: HTTPSConnection, headers: Dict[str, Any], method: str, url: str, body: Any = None, context: str = "") -> Dict[str, Any]: +def _make_request( + conn: HTTPSConnection, + headers: Dict[str, Any], + method: str, + url: str, + body: Optional[Any] = None, + context: str = "" +) -> Dict[str, Any]: + """Make an HTTP request to the Datadog API. + + Handles making HTTP requests to Datadog's API with proper error handling + and response parsing. + + Args: + conn: The HTTP connection to use + headers: Request headers + method: HTTP method (GET, POST, etc.) + url: Request URL + body: Request body (optional) + context: Context string for error messages (optional) + + Returns: + Dict[str, Any]: Parsed JSON response + + Raises: + Exception: If the request fails, returns an error status, or returns invalid JSON + """ if method == "GET": conn.request(method, url, headers=headers) else: @@ -642,7 +808,56 @@ def _make_request(conn: HTTPSConnection, headers: Dict[str, Any], method: str, u raise Exception(error_message) def _make_id() -> str: - return hashlib.sha256(datetime.now().isoformat().encode('utf-8')).hexdigest() + """Generate a unique identifier. + + Returns: + str: A random UUID as a hexadecimal string + """ + return uuid.uuid4().hex + +def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: + """Decorator that creates multiple versions of a function with different parameter combinations. + + Creates multiple versions of a function by generating all possible combinations + of the provided parameters. Each generated function variant includes tags + indicating its parameter values. + + Args: + **param_dict: Dictionary of parameter names and their possible values. + Values can be single items or lists of possible values. 
+ + Returns: + Callable: Decorator function that generates parameterized versions of the input function + + Example: + @parametrize(model=["gpt-3", "gpt-4"], temperature=[0.0, 0.7]) + def my_function(text, model, temperature): + # This will create 4 versions of the function with different combinations + # of model and temperature parameters + pass + """ + def decorator(func): + # Generate all combinations of parameters + param_names = list(param_dict.keys()) + param_values = [param_dict[name] if isinstance(param_dict[name], (list, tuple)) + else [param_dict[name]] for name in param_names] + param_combinations = [dict(zip(param_names, combo)) + for combo in itertools.product(*param_values)] + + # Create a new function for each parameter combination + def create_parameterized_func(params): + def wrapped_func(*args, **kwargs): + return func(*args, **{**kwargs, **params}) + + # Create a descriptive name for the function + param_str = "-".join(f"{k}={v}" for k, v in params.items()) + wrapped_func.__name__ = f"{func.__name__}_{param_str}" + wrapped_func.tags = [f"{k}:{v}" for k, v in params.items()] + return wrapped_func + + return [create_parameterized_func(combo) for combo in param_combinations] + + return decorator From 88f05d3a2c16768bd8f0a2722869f7baeaa20f37 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 29 Oct 2024 10:48:36 -0400 Subject: [PATCH 05/36] Format code --- ddtrace/llmobs/experiments.py | 416 +++++++++++++++++++--------------- 1 file changed, 236 insertions(+), 180 deletions(-) diff --git a/ddtrace/llmobs/experiments.py b/ddtrace/llmobs/experiments.py index 5781b1e9572..ce159c752c8 100644 --- a/ddtrace/llmobs/experiments.py +++ b/ddtrace/llmobs/experiments.py @@ -25,7 +25,7 @@ def _validate_api_keys() -> None: for key in ["DD_API_KEY", "DD_APPLICATION_KEY"]: if not os.getenv(key): missing_keys.append(key) - + if missing_keys: raise ValueError( f"Missing required Datadog API keys in environment variables: {', '.join(missing_keys)}. 
" @@ -46,7 +46,9 @@ class Dataset: datadog_dataset_id (str): ID assigned by Datadog after pushing (None if not pushed) """ - def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") -> None: + def __init__( + self, name: str, data: List[Dict[str, Any]], description: str = "" + ) -> None: self.name = name self._validate_data(data) self.data = data @@ -54,7 +56,6 @@ def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") # Post-push attributes self.datadog_dataset_id = None - def __iter__(self) -> Iterator[Dict[str, Any]]: return iter(self.data) @@ -64,31 +65,30 @@ def __len__(self) -> int: def __getitem__(self, index: int) -> Dict[str, Any]: return self.data[index] - - + def __repr__(self) -> str: header = f"Dataset: {self.name}\nDescription: {self.description}\nLength: {len(self)}\nDatadog ID: {self.datadog_dataset_id}\n" - separator = "+" + "-"*10 + "+" + "-"*38 + "+" + "-"*38 + "+" + separator = "+" + "-" * 10 + "+" + "-" * 38 + "+" + "-" * 38 + "+" def format_dict(d: Dict[str, Any]) -> List[str]: def truncate(value: str) -> str: - return (value[:17] + '...') if len(value) > 20 else value + return (value[:17] + "...") if len(value) > 20 else value return [f"{key}: {truncate(str(value))}" for key, value in d.items()] - def format_entries(entries): # Fixed indentation - this was nested too deeply + def format_entries(entries): formatted_rows = [] for i, entry in entries: - input_lines = format_dict(entry['input']) - expected_output_lines = format_dict(entry.get('expected_output', {})) - + input_lines = format_dict(entry["input"]) + expected_output_lines = format_dict(entry.get("expected_output", {})) + # Determine the maximum number of lines in input and expected_output max_lines = max(len(input_lines), len(expected_output_lines)) - + # Pad the lists to have the same number of lines - input_lines += [''] * (max_lines - len(input_lines)) - expected_output_lines += [''] * (max_lines - len(expected_output_lines)) - + input_lines += [""] * (max_lines - len(input_lines)) + expected_output_lines += [""] * (max_lines - len(expected_output_lines)) + for j in range(max_lines): if j == 0: index = f"| {i+1:<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} |" @@ -102,12 +102,14 @@ def format_entries(entries): # Fixed indentation - this was nested too deeply entries = format_entries(enumerate(self.data)) else: first_two = format_entries(enumerate(self.data[:2])) - last_two = format_entries(enumerate(self.data[-2:], start=len(self.data) - 2)) + last_two = format_entries( + enumerate(self.data[-2:], start=len(self.data) - 2) + ) entries = f"{first_two}\n| {'...':<8} | {'...':<38} | {'...':<38} |\n{separator}\n{last_two}" table = f"{separator}\n| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} |\n{separator}\n{entries}" return f"{header}\n{table if entries else 'No entries available.'}\n\n" - + def _validate_data(self, data: List[Dict[str, Any]]) -> None: """Validate the format and structure of dataset records. 
@@ -132,14 +134,18 @@ def _validate_data(self, data: List[Dict[str, Any]]) -> None: for row in data: if set(row.keys()) != first_row_keys: raise ValueError("All rows must have the same keys.") - + # Check that 'input' and 'expected_output' are flat dictionaries - for key in ['input', 'expected_output']: - if key in row and any(isinstance(value, dict) for value in row[key].values()): - raise ValueError(f"'{key}' must be a flat dictionary (no nested dictionaries).") + for key in ["input", "expected_output"]: + if key in row and any( + isinstance(value, dict) for value in row[key].values() + ): + raise ValueError( + f"'{key}' must be a flat dictionary (no nested dictionaries)." + ) @classmethod - def from_datadog(cls, name: str) -> 'Dataset': + def from_datadog(cls, name: str) -> "Dataset": """Create a dataset from a dataset hosted in Datadog. Args: @@ -157,34 +163,40 @@ def from_datadog(cls, name: str) -> 'Dataset': headers = { "DD-API-KEY": os.getenv("DD_API_KEY"), "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), - "Content-Type": "application/json" + "Content-Type": "application/json", } try: # Get dataset ID encoded_name = quote(name) url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" - response_data = _make_request(conn, headers, "GET", url, context="Dataset lookup") - datasets = response_data.get('data', []) + response_data = _make_request( + conn, headers, "GET", url, context="Dataset lookup" + ) + datasets = response_data.get("data", []) if not datasets: raise ValueError(f"Dataset '{name}' not found") - dataset_id = datasets[0]['id'] + dataset_id = datasets[0]["id"] # Get dataset records url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" - records_data = _make_request(conn, headers, "GET", url, context="Records lookup") - + records_data = _make_request( + conn, headers, "GET", url, context="Records lookup" + ) + # Transform records into the expected format class_records = [] - for record in records_data.get('data', []): - attrs = record.get('attributes', {}) - class_records.append({ - "input": attrs.get('input', {}), - "expected_output": attrs.get('expected_output', {}), - **attrs.get('metadata', {}) - }) + for record in records_data.get("data", []): + attrs = record.get("attributes", {}) + class_records.append( + { + "input": attrs.get("input", {}), + "expected_output": attrs.get("expected_output", {}), + **attrs.get("metadata", {}), + } + ) # Create new dataset instance dataset = cls(name, class_records) @@ -196,7 +208,7 @@ def from_datadog(cls, name: str) -> 'Dataset': def push(self) -> Dict[str, str]: """Push the dataset to Datadog. 
- + Returns: Dict[str, str]: Dictionary containing dataset information including: - dataset_id: The ID of the created/updated dataset @@ -208,15 +220,17 @@ def push(self) -> Dict[str, str]: headers = { "DD-API-KEY": os.getenv("DD_API_KEY"), "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), - "Content-Type": "application/json" + "Content-Type": "application/json", } try: # Check if dataset exists encoded_name = quote(self.name) url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" - response_data = _make_request(conn, headers, "GET", url, context="Dataset lookup") - datasets = response_data.get('data', []) + response_data = _make_request( + conn, headers, "GET", url, context="Dataset lookup" + ) + datasets = response_data.get("data", []) if not datasets: # Create new dataset @@ -226,9 +240,10 @@ def push(self) -> Dict[str, str]: "type": "datasets", "attributes": { "name": self.name, - "description": self.description or f"Dataset used for {self.name}", - "metadata": {"team": "ml-obs"} - } + "description": self.description + or f"Dataset used for {self.name}", + "metadata": {"team": "ml-obs"}, + }, } } response_data = _make_request( @@ -237,24 +252,26 @@ def push(self) -> Dict[str, str]: "POST", "/api/unstable/llm-obs/v1/datasets", body=json.dumps(dataset_payload), - context="Dataset creation" + context="Dataset creation", ) - dataset_id = response_data['data']['id'] + dataset_id = response_data["data"]["id"] self.datadog_dataset_id = dataset_id else: # Dataset exists, create a new version - dataset_id = datasets[0]['id'] + dataset_id = datasets[0]["id"] version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") new_dataset_name = f"{self.name}-{version_suffix}" - print(f"Dataset '{self.name}' found. Creating new version '{new_dataset_name}'.") + print( + f"Dataset '{self.name}' found. Creating new version '{new_dataset_name}'." + ) dataset_payload = { "data": { "type": "datasets", "attributes": { "name": new_dataset_name, "description": f"Dataset versioned on {version_suffix} used for {self.name}", - "metadata": {"team": "ml-obs"} - } + "metadata": {"team": "ml-obs"}, + }, } } response_data = _make_request( @@ -263,36 +280,36 @@ def push(self) -> Dict[str, str]: "POST", "/api/unstable/llm-obs/v1/datasets", body=json.dumps(dataset_payload), - context="Dataset version creation" + context="Dataset version creation", ) - dataset_id = response_data['data']['id'] + dataset_id = response_data["data"]["id"] self.datadog_dataset_id = dataset_id self.name = new_dataset_name # Add records to the dataset records_payload = { - "data": { - "type": "datasets", - "attributes": { - "records": self.data - } - } + "data": {"type": "datasets", "attributes": {"records": self.data}} } url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" - _make_request(conn, headers, "POST", url, body=json.dumps(records_payload), context="Adding records") + _make_request( + conn, + headers, + "POST", + url, + body=json.dumps(records_payload), + context="Adding records", + ) print(f"✓ Successfully uploaded dataset '{self.name}'") print(f" • Dataset ID: {dataset_id}") print(f" • Records uploaded: {len(self.data)}") - + return self finally: conn.close() - - class Experiment: """Manages the execution and evaluation of LLM tasks on a dataset. 
@@ -310,7 +327,15 @@ class Experiment: results (ExperimentResults): Results after running the experiment """ - def __init__(self, name: str, task: Callable, dataset: Dataset, evaluators: List[Callable], tags: List[str] = [], project_name: str = "-") -> None: + def __init__( + self, + name: str, + task: Callable, + dataset: Dataset, + evaluators: List[Callable], + tags: List[str] = [], + project_name: str = "-", + ) -> None: self.name = name self.task = task self.dataset = dataset @@ -321,15 +346,16 @@ def __init__(self, name: str, task: Callable, dataset: Dataset, evaluators: List self.has_run = False self.results = None - def __repr__(self) -> str: - separator = "+" + "-"*20 + "+" + "-"*50 + "+" - + separator = "+" + "-" * 20 + "+" + "-" * 50 + "+" + def format_evaluator(evaluator: Callable) -> str: return f"{evaluator.__name__}" evaluator_lines = [format_evaluator(evaluator) for evaluator in self.evaluators] - evaluators = ", ".join(evaluator_lines) if evaluator_lines else "No evaluators available" + evaluators = ( + ", ".join(evaluator_lines) if evaluator_lines else "No evaluators available" + ) table = ( f"{separator}\n" @@ -358,9 +384,11 @@ def _validate_tags(self) -> None: """ for tag in self.tags: if not isinstance(tag, str) or ":" not in tag: - raise ValueError(f"Invalid tag format: {tag}. Tags should be in the format 'key:value'.") + raise ValueError( + f"Invalid tag format: {tag}. Tags should be in the format 'key:value'." + ) - def run(self, _jobs: int = 10) -> 'ExperimentResults': + def run(self, _jobs: int = 10) -> "ExperimentResults": """Execute the experiment on the dataset. Runs the task function on each dataset record in parallel and collects @@ -391,10 +419,11 @@ def process_row(idx_row): end_time = time.time() duration = end_time - start_time - # Evaluate the output - evaluations = {evaluator.__name__: evaluator(row, output) for evaluator in self.evaluators} + evaluations = { + evaluator.__name__: evaluator(row, output) + for evaluator in self.evaluators + } - # Store the result return { "idx": idx, "result": { @@ -409,11 +438,10 @@ def process_row(idx_row): "dataset_name": self.dataset.name, }, "tags": self.tags, - "error": None - } + "error": None, + }, } except Exception as e: - # Handle exceptions and store the error return { "idx": idx, "result": { @@ -428,48 +456,47 @@ def process_row(idx_row): "dataset_name": self.dataset.name, }, "tags": self.tags, - "error": str(e) - } + "error": str(e), + }, } with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: - # Create futures list first future_to_idx = { - executor.submit(process_row, (idx, row)): idx + executor.submit(process_row, (idx, row)): idx for idx, row in enumerate(self.dataset) } - + # Process as they complete while maintaining order completed = 0 results_buffer = [None] * total_rows for future in concurrent.futures.as_completed(future_to_idx): idx = future_to_idx[future] - results_buffer[idx] = future.result()['result'] + results_buffer[idx] = future.result()["result"] completed += 1 # Update progress progress = int(50 * completed / total_rows) - bar = '=' * progress + ' ' * (50 - progress) + bar = "=" * progress + " " * (50 - progress) percent = int(100 * completed / total_rows) - sys.stdout.write(f'\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})') + sys.stdout.write( + f"\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})" + ) sys.stdout.flush() # Add results in correct order results.experiment_rows = results_buffer # Print a new line after completion 
- sys.stdout.write('\n') + sys.stdout.write("\n") self.has_run = True self.results = results return results - def get_results(self) -> Union['ExperimentResults', List['ExperimentResults']]: + def get_results(self) -> Union["ExperimentResults", List["ExperimentResults"]]: if not self.has_run: raise ValueError("Experiment has not been run yet") return self.results - - class ExperimentResults: @@ -490,12 +517,25 @@ def __init__(self, dataset: Dataset, experiment: Experiment) -> None: self.experiment_rows = [] def __repr__(self) -> str: - separator = "+" + "-"*10 + "+" + "-"*38 + "+" + "-"*38 + "+" + "-"*38 + "+" + "-"*38 + "+" + separator = ( + "+" + + "-" * 10 + + "+" + + "-" * 38 + + "+" + + "-" * 38 + + "+" + + "-" * 38 + + "+" + + "-" * 38 + + "+" + ) def format_dict(d: Union[Dict[str, Any], List[Any]]) -> List[str]: if isinstance(d, dict): + def truncate(value: str) -> str: - return (value[:17] + '...') if len(value) > 20 else value + return (value[:17] + "...") if len(value) > 20 else value return [f"{key}: {truncate(str(value))}" for key, value in d.items()] elif isinstance(d, list): @@ -506,22 +546,29 @@ def truncate(value: str) -> str: def format_entries(entries): formatted_rows = [] for i, entry in enumerate(entries): - dataset_idx = entry['metadata']['dataset_record_idx'] + dataset_idx = entry["metadata"]["dataset_record_idx"] dataset_entry = self.dataset[dataset_idx] - input_lines = format_dict(dataset_entry['input']) - expected_output_lines = format_dict(dataset_entry.get('expected_output', {})) - output_lines = format_dict(entry['output']) - evaluations_lines = format_dict(entry.get('evaluations', [])) - + input_lines = format_dict(dataset_entry["input"]) + expected_output_lines = format_dict( + dataset_entry.get("expected_output", {}) + ) + output_lines = format_dict(entry["output"]) + evaluations_lines = format_dict(entry.get("evaluations", [])) + # Determine the maximum number of lines across all fields - max_lines = max(len(input_lines), len(expected_output_lines), len(output_lines), len(evaluations_lines)) - + max_lines = max( + len(input_lines), + len(expected_output_lines), + len(output_lines), + len(evaluations_lines), + ) + # Pad the lists to have the same number of lines - input_lines += [''] * (max_lines - len(input_lines)) - expected_output_lines += [''] * (max_lines - len(expected_output_lines)) - output_lines += [''] * (max_lines - len(output_lines)) - evaluations_lines += [''] * (max_lines - len(evaluations_lines)) - + input_lines += [""] * (max_lines - len(input_lines)) + expected_output_lines += [""] * (max_lines - len(expected_output_lines)) + output_lines += [""] * (max_lines - len(output_lines)) + evaluations_lines += [""] * (max_lines - len(evaluations_lines)) + for j in range(max_lines): if j == 0: index = f"| {dataset_idx:<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" @@ -544,20 +591,22 @@ def format_entries(entries): f"{separator}\n" f"{entries}" ) - return f"Experiment Results:\n{table if entries else 'No results available.'}\n\n" + return ( + f"Experiment Results:\n{table if entries else 'No results available.'}\n\n" + ) def __iter__(self) -> Iterator[Dict[str, Any]]: return iter(self.experiment_rows) - + def __len__(self) -> int: return len(self.experiment_rows) def __getitem__(self, index: int) -> Any: return self.experiment_rows[index] - + def push(self) -> Dict[str, str]: """Push the experiment results to Datadog. 
- + Returns: Dict[str, str]: Dictionary containing experiment information including: - experiment_id: The ID of the created experiment @@ -565,32 +614,36 @@ def push(self) -> Dict[str, str]: - span_count: Number of spans uploaded """ _validate_api_keys() - + # Initialize connection and headers conn = HTTPSConnection(BASE_URL) headers = { "DD-API-KEY": os.getenv("DD_API_KEY"), "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), - "Content-Type": "application/json" + "Content-Type": "application/json", } try: # Check if project exists url = f"/api/unstable/llm-obs/v1/projects?filter[name]={self.experiment.project_name}" - response_data = _make_request(conn, headers, "GET", url, context="Project lookup") - projects = response_data.get('data', []) + response_data = _make_request( + conn, headers, "GET", url, context="Project lookup" + ) + projects = response_data.get("data", []) if not projects: # Create new project - print(f"Project '{self.experiment.project_name}' not found. Creating it.") + print( + f"Project '{self.experiment.project_name}' not found. Creating it." + ) project_payload = { "data": { "type": "projects", "attributes": { "name": self.experiment.project_name, "description": f"Project for {self.experiment.project_name}", - "metadata": {"team": "ml-obs"} - } + "metadata": {"team": "ml-obs"}, + }, } } response_data = _make_request( @@ -599,17 +652,19 @@ def push(self) -> Dict[str, str]: "POST", "/api/unstable/llm-obs/v1/projects", body=json.dumps(project_payload), - context="Project creation" + context="Project creation", ) - project_id = response_data['data']['id'] + project_id = response_data["data"]["id"] else: - project_id = projects[0]['id'] + project_id = projects[0]["id"] # Check if experiment exists encoded_name = quote(self.experiment.name) url = f"/api/unstable/llm-obs/v1/experiments?filter[name]={encoded_name}" - response_data = _make_request(conn, headers, "GET", url, context="Experiment lookup") - experiments = response_data.get('data', []) + response_data = _make_request( + conn, headers, "GET", url, context="Experiment lookup" + ) + experiments = response_data.get("data", []) if not experiments: # Create new experiment @@ -624,9 +679,9 @@ def push(self) -> Dict[str, str]: "project_id": project_id, "metadata": { "tags": self.experiment.tags, - "team": "ml-obs" - } - } + "team": "ml-obs", + }, + }, } } response_data = _make_request( @@ -635,14 +690,16 @@ def push(self) -> Dict[str, str]: "POST", "/api/unstable/llm-obs/v1/experiments", body=json.dumps(experiment_payload), - context="Experiment creation" + context="Experiment creation", ) - experiment_id = response_data['data']['id'] + experiment_id = response_data["data"]["id"] else: # Experiment exists, create a new version version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") new_experiment_name = f"{self.experiment.name}-{version_suffix}" - print(f"Experiment '{self.experiment.name}' found. Creating new version '{new_experiment_name}'.") + print( + f"Experiment '{self.experiment.name}' found. Creating new version '{new_experiment_name}'." 
+ ) experiment_payload = { "data": { "type": "experiments", @@ -653,9 +710,9 @@ def push(self) -> Dict[str, str]: "project_id": project_id, "metadata": { "tags": self.experiment.tags, - "team": "ml-obs" - } - } + "team": "ml-obs", + }, + }, } } response_data = _make_request( @@ -664,51 +721,49 @@ def push(self) -> Dict[str, str]: "POST", "/api/unstable/llm-obs/v1/experiments", body=json.dumps(experiment_payload), - context="Experiment version creation" + context="Experiment version creation", ) - experiment_id = response_data['data']['id'] + experiment_id = response_data["data"]["id"] self.experiment.name = new_experiment_name - # Prepare and send experiment results spans = [] metrics = [] - - for idx, result in enumerate(self.experiment_rows): - + span = { "span_id": _make_id(), "project_id": project_id, "experiment_id": experiment_id, "dataset_id": self.experiment.dataset.datadog_dataset_id, "dataset_record_id": _make_id(), - "start_ns": int(result['metadata']['timestamp'] * 1e9), - "duration": float(result['metadata']['duration'] * 1e9), + "start_ns": int(result["metadata"]["timestamp"] * 1e9), + "duration": float(result["metadata"]["duration"] * 1e9), "tags": self.experiment.tags, "status": "ok", "meta": { "span": {"kind": "experiment"}, - "input": self.experiment.dataset[idx]['input'], - "output": result['output'], - "expected_output": self.experiment.dataset[idx].get('expected_output', {}), + "input": self.experiment.dataset[idx]["input"], + "output": result["output"], + "expected_output": self.experiment.dataset[idx].get( + "expected_output", {} + ), "error": { - "message": result['error'], + "message": result["error"], "stack": None, - "type": None - } - } - + "type": None, + }, + }, } spans.append(span) # Add evaluation metrics - for metric_name, metric_value in result['evaluations'].items(): - timestamp_ms = int(result['metadata']['timestamp'] * 1000) - + for metric_name, metric_value in result["evaluations"].items(): + timestamp_ms = int(result["metadata"]["timestamp"] * 1000) + if isinstance(metric_value, bool): metric_value = 1 if metric_value else 0 - metric_type = "score" + metric_type = "score" elif isinstance(metric_value, (int, float)): metric_type = "score" else: @@ -716,25 +771,23 @@ def push(self) -> Dict[str, str]: metric_value = str(metric_value) metric = { - "span_id": span['span_id'], + "span_id": span["span_id"], "metric_type": metric_type, "timestamp_ms": timestamp_ms, "label": metric_name, - "score_value" if metric_type == "score" else "categorical_value": metric_value + "score_value" + if metric_type == "score" + else "categorical_value": metric_value, } metrics.append(metric) results_payload = { "data": { "type": "experiments", - "attributes": { - "spans": spans, - "metrics": metrics - } + "attributes": {"spans": spans, "metrics": metrics}, } } - url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" _make_request( conn, @@ -742,14 +795,16 @@ def push(self) -> Dict[str, str]: "POST", url, body=json.dumps(results_payload), - context="Publishing results" + context="Publishing results", ) - print(f"✓ Successfully uploaded experiment results for '{self.experiment.name}'") + print( + f"✓ Successfully uploaded experiment results for '{self.experiment.name}'" + ) print(f" • Experiment ID: {experiment_id}") print(f" • Spans uploaded: {len(spans)}") print(f" • Metrics uploaded: {len(metrics)}") - + return self finally: @@ -762,7 +817,7 @@ def _make_request( method: str, url: str, body: Optional[Any] = None, - context: str = "" + context: str = "", ) -> 
Dict[str, Any]: """Make an HTTP request to the Datadog API. @@ -787,26 +842,26 @@ def _make_request( conn.request(method, url, headers=headers) else: if body is not None and isinstance(body, str): - body = body.encode('utf-8') + body = body.encode("utf-8") conn.request(method, url, body=body, headers=headers) - + response = conn.getresponse() response_body = response.read() - + if response.status >= 400: error_message = f"HTTP {response.status} Error during {context}: {response.reason}\nResponse body: {response_body.decode('utf-8')}" raise Exception(error_message) - - # Add handling for empty response + if not response_body: - return {} # Return empty dict for empty responses - + return {} + try: return json.loads(response_body) except json.JSONDecodeError: error_message = f"Invalid JSON response during {context}. Status: {response.status}\nResponse body: {response_body.decode('utf-8')}" raise Exception(error_message) + def _make_id() -> str: """Generate a unique identifier. @@ -815,17 +870,18 @@ def _make_id() -> str: """ return uuid.uuid4().hex + def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: """Decorator that creates multiple versions of a function with different parameter combinations. - + Creates multiple versions of a function by generating all possible combinations of the provided parameters. Each generated function variant includes tags indicating its parameter values. - + Args: **param_dict: Dictionary of parameter names and their possible values. Values can be single items or lists of possible values. - + Returns: Callable: Decorator function that generates parameterized versions of the input function @@ -836,30 +892,30 @@ def my_function(text, model, temperature): # of model and temperature parameters pass """ + def decorator(func): # Generate all combinations of parameters param_names = list(param_dict.keys()) - param_values = [param_dict[name] if isinstance(param_dict[name], (list, tuple)) - else [param_dict[name]] for name in param_names] - param_combinations = [dict(zip(param_names, combo)) - for combo in itertools.product(*param_values)] - + param_values = [ + param_dict[name] + if isinstance(param_dict[name], (list, tuple)) + else [param_dict[name]] + for name in param_names + ] + param_combinations = [ + dict(zip(param_names, combo)) for combo in itertools.product(*param_values) + ] + # Create a new function for each parameter combination def create_parameterized_func(params): def wrapped_func(*args, **kwargs): return func(*args, **{**kwargs, **params}) - - # Create a descriptive name for the function + param_str = "-".join(f"{k}={v}" for k, v in params.items()) wrapped_func.__name__ = f"{func.__name__}_{param_str}" wrapped_func.tags = [f"{k}:{v}" for k, v in params.items()] return wrapped_func - - return [create_parameterized_func(combo) for combo in param_combinations] - - return decorator - - - + return [create_parameterized_func(combo) for combo in param_combinations] + return decorator From e73a897cb892c4b7dfbfe39eb335f0a8881c34ec Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 29 Oct 2024 10:55:53 -0400 Subject: [PATCH 06/36] Add custom exception classes --- ddtrace/llmobs/experiments.py | 181 ++++++++++++++++------------------ 1 file changed, 86 insertions(+), 95 deletions(-) diff --git a/ddtrace/llmobs/experiments.py b/ddtrace/llmobs/experiments.py index ce159c752c8..93ec0bab7f6 100644 --- a/ddtrace/llmobs/experiments.py +++ b/ddtrace/llmobs/experiments.py @@ -15,24 +15,6 @@ BASE_URL = "api.datadoghq.com" -def 
_validate_api_keys() -> None: - """Validate that required Datadog API keys are set in environment variables. - - Raises: - ValueError: If any required API keys are missing from environment variables - """ - missing_keys = [] - for key in ["DD_API_KEY", "DD_APPLICATION_KEY"]: - if not os.getenv(key): - missing_keys.append(key) - - if missing_keys: - raise ValueError( - f"Missing required Datadog API keys in environment variables: {', '.join(missing_keys)}. " - "Please set these environment variables before pushing to Datadog." - ) - - class Dataset: """A container for LLM experiment data that can be pushed to and retrieved from Datadog. @@ -68,7 +50,7 @@ def __getitem__(self, index: int) -> Dict[str, Any]: def __repr__(self) -> str: header = f"Dataset: {self.name}\nDescription: {self.description}\nLength: {len(self)}\nDatadog ID: {self.datadog_dataset_id}\n" - separator = "+" + "-" * 10 + "+" + "-" * 38 + "+" + "-" * 38 + "+" + separator = f"+{'-' * 10}+{'-' * 38}+{'-' * 38}+" def format_dict(d: Dict[str, Any]) -> List[str]: def truncate(value: str) -> str: @@ -347,7 +329,7 @@ def __init__( self.results = None def __repr__(self) -> str: - separator = "+" + "-" * 20 + "+" + "-" * 50 + "+" + separator = f"+{'-' * 20}+{'-' * 50}+" def format_evaluator(evaluator: Callable) -> str: return f"{evaluator.__name__}" @@ -517,19 +499,7 @@ def __init__(self, dataset: Dataset, experiment: Experiment) -> None: self.experiment_rows = [] def __repr__(self) -> str: - separator = ( - "+" - + "-" * 10 - + "+" - + "-" * 38 - + "+" - + "-" * 38 - + "+" - + "-" * 38 - + "+" - + "-" * 38 - + "+" - ) + separator = f"+{'-' * 10}+{'-' * 38}+{'-' * 38}+{'-' * 38}+{'-' * 38}+" def format_dict(d: Union[Dict[str, Any], List[Any]]) -> List[str]: if isinstance(d, dict): @@ -591,9 +561,7 @@ def format_entries(entries): f"{separator}\n" f"{entries}" ) - return ( - f"Experiment Results:\n{table if entries else 'No results available.'}\n\n" - ) + return f"Experiment Results:\n{table if entries else 'No results available.'}\n\n" def __iter__(self) -> Iterator[Dict[str, Any]]: return iter(self.experiment_rows) @@ -811,6 +779,57 @@ def push(self) -> Dict[str, str]: conn.close() +def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: + """Decorator that creates multiple versions of a function with different parameter combinations. + + Creates multiple versions of a function by generating all possible combinations + of the provided parameters. Each generated function variant includes tags + indicating its parameter values. + + Args: + **param_dict: Dictionary of parameter names and their possible values. + Values can be single items or lists of possible values. 
+ + Returns: + Callable: Decorator function that generates parameterized versions of the input function + + Example: + @parametrize(model=["gpt-3", "gpt-4"], temperature=[0.0, 0.7]) + def my_function(text, model, temperature): + # This will create 4 versions of the function with different combinations + # of model and temperature parameters + pass + """ + + def decorator(func): + # Generate all combinations of parameters + param_names = list(param_dict.keys()) + param_values = [ + param_dict[name] + if isinstance(param_dict[name], (list, tuple)) + else [param_dict[name]] + for name in param_names + ] + param_combinations = [ + dict(zip(param_names, combo)) for combo in itertools.product(*param_values) + ] + + # Create a new function for each parameter combination + def create_parameterized_func(params): + def wrapped_func(*args, **kwargs): + return func(*args, **{**kwargs, **params}) + + param_str = "-".join(f"{k}={v}" for k, v in params.items()) + wrapped_func.__name__ = f"{func.__name__}_{param_str}" + wrapped_func.tags = [f"{k}:{v}" for k, v in params.items()] + return wrapped_func + + return [create_parameterized_func(combo) for combo in param_combinations] + + return decorator + + + def _make_request( conn: HTTPSConnection, headers: Dict[str, Any], @@ -821,22 +840,9 @@ def _make_request( ) -> Dict[str, Any]: """Make an HTTP request to the Datadog API. - Handles making HTTP requests to Datadog's API with proper error handling - and response parsing. - - Args: - conn: The HTTP connection to use - headers: Request headers - method: HTTP method (GET, POST, etc.) - url: Request URL - body: Request body (optional) - context: Context string for error messages (optional) - - Returns: - Dict[str, Any]: Parsed JSON response - Raises: - Exception: If the request fails, returns an error status, or returns invalid JSON + DatadogAPIError: If the request fails or returns an error status + DatadogResponseError: If the response contains invalid JSON """ if method == "GET": conn.request(method, url, headers=headers) @@ -847,10 +853,11 @@ def _make_request( response = conn.getresponse() response_body = response.read() + response_text = response_body.decode('utf-8') if response.status >= 400: - error_message = f"HTTP {response.status} Error during {context}: {response.reason}\nResponse body: {response_body.decode('utf-8')}" - raise Exception(error_message) + error_message = f"HTTP {response.status} Error during {context}: {response.reason}" + raise DatadogAPIError(error_message, status_code=response.status, response=response_text) if not response_body: return {} @@ -858,8 +865,8 @@ def _make_request( try: return json.loads(response_body) except json.JSONDecodeError: - error_message = f"Invalid JSON response during {context}. Status: {response.status}\nResponse body: {response_body.decode('utf-8')}" - raise Exception(error_message) + error_message = f"Invalid JSON response during {context}. Status: {response.status}" + raise DatadogResponseError(error_message, raw_response=response_text) def _make_id() -> str: @@ -871,51 +878,35 @@ def _make_id() -> str: return uuid.uuid4().hex -def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: - """Decorator that creates multiple versions of a function with different parameter combinations. 
+class DatadogAPIError(Exception): + """Raised when there is an error interacting with the Datadog API.""" + def __init__(self, message: str, status_code: Optional[int] = None, response: Optional[str] = None): + self.status_code = status_code + self.response = response + super().__init__(message) - Creates multiple versions of a function by generating all possible combinations - of the provided parameters. Each generated function variant includes tags - indicating its parameter values. +class DatadogResponseError(Exception): + """Raised when there is an error parsing the response from Datadog.""" + def __init__(self, message: str, raw_response: Optional[str] = None): + self.raw_response = raw_response + super().__init__(message) - Args: - **param_dict: Dictionary of parameter names and their possible values. - Values can be single items or lists of possible values. - Returns: - Callable: Decorator function that generates parameterized versions of the input function +def _validate_api_keys() -> None: + """Validate that required Datadog API keys are set in environment variables. - Example: - @parametrize(model=["gpt-3", "gpt-4"], temperature=[0.0, 0.7]) - def my_function(text, model, temperature): - # This will create 4 versions of the function with different combinations - # of model and temperature parameters - pass + Raises: + ValueError: If any required API keys are missing from environment variables """ + missing_keys = [] + for key in ["DD_API_KEY", "DD_APPLICATION_KEY"]: + if not os.getenv(key): + missing_keys.append(key) - def decorator(func): - # Generate all combinations of parameters - param_names = list(param_dict.keys()) - param_values = [ - param_dict[name] - if isinstance(param_dict[name], (list, tuple)) - else [param_dict[name]] - for name in param_names - ] - param_combinations = [ - dict(zip(param_names, combo)) for combo in itertools.product(*param_values) - ] - - # Create a new function for each parameter combination - def create_parameterized_func(params): - def wrapped_func(*args, **kwargs): - return func(*args, **{**kwargs, **params}) - - param_str = "-".join(f"{k}={v}" for k, v in params.items()) - wrapped_func.__name__ = f"{func.__name__}_{param_str}" - wrapped_func.tags = [f"{k}:{v}" for k, v in params.items()] - return wrapped_func + if missing_keys: + raise ValueError( + f"Missing required Datadog API keys in environment variables: {', '.join(missing_keys)}. " + "Please set these environment variables before pushing to Datadog." 
+ ) - return [create_parameterized_func(combo) for combo in param_combinations] - return decorator From f8c9ef0ebd182a5c2b954e4752c3f7a1dcc3c350 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 29 Oct 2024 11:24:23 -0400 Subject: [PATCH 07/36] Move code to another directory --- ddtrace/llmobs/experiments/__init__.py | 3 +++ ddtrace/llmobs/{experiments.py => experiments/_experiments.py} | 0 2 files changed, 3 insertions(+) create mode 100644 ddtrace/llmobs/experiments/__init__.py rename ddtrace/llmobs/{experiments.py => experiments/_experiments.py} (100%) diff --git a/ddtrace/llmobs/experiments/__init__.py b/ddtrace/llmobs/experiments/__init__.py new file mode 100644 index 00000000000..2979d72aebf --- /dev/null +++ b/ddtrace/llmobs/experiments/__init__.py @@ -0,0 +1,3 @@ +from ._experiments import Dataset, Experiment, parametrize + +__all__ = ["Dataset", "Experiment", "parametrize"] \ No newline at end of file diff --git a/ddtrace/llmobs/experiments.py b/ddtrace/llmobs/experiments/_experiments.py similarity index 100% rename from ddtrace/llmobs/experiments.py rename to ddtrace/llmobs/experiments/_experiments.py From 59577e1bcedaa7a5f728580581d59bda032f88ec Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 29 Oct 2024 11:29:31 -0400 Subject: [PATCH 08/36] Change experiments module export --- ddtrace/llmobs/{experiments => }/_experiments.py | 0 ddtrace/llmobs/experiments/__init__.py | 3 --- 2 files changed, 3 deletions(-) rename ddtrace/llmobs/{experiments => }/_experiments.py (100%) delete mode 100644 ddtrace/llmobs/experiments/__init__.py diff --git a/ddtrace/llmobs/experiments/_experiments.py b/ddtrace/llmobs/_experiments.py similarity index 100% rename from ddtrace/llmobs/experiments/_experiments.py rename to ddtrace/llmobs/_experiments.py diff --git a/ddtrace/llmobs/experiments/__init__.py b/ddtrace/llmobs/experiments/__init__.py deleted file mode 100644 index 2979d72aebf..00000000000 --- a/ddtrace/llmobs/experiments/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from ._experiments import Dataset, Experiment, parametrize - -__all__ = ["Dataset", "Experiment", "parametrize"] \ No newline at end of file From 402d4021d61a1e2ca54927b3accac44c7fd63ee3 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 29 Oct 2024 11:31:41 -0400 Subject: [PATCH 09/36] Use f strings --- ddtrace/llmobs/_experiments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 93ec0bab7f6..572c8ab5870 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -458,7 +458,7 @@ def process_row(idx_row): # Update progress progress = int(50 * completed / total_rows) - bar = "=" * progress + " " * (50 - progress) + bar = f"{'=' * progress}{' ' * (50 - progress)}" percent = int(100 * completed / total_rows) sys.stdout.write( f"\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})" From 2c281c5df225332c7b9344f8c3f2f1b4749a91af Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 29 Oct 2024 16:48:19 -0400 Subject: [PATCH 10/36] Decouple running from evaluating --- ddtrace/llmobs/_experiments.py | 162 ++++++++++++++++++++++++--------- 1 file changed, 118 insertions(+), 44 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 572c8ab5870..41a5cef3804 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -306,7 +306,9 @@ class Experiment: tags (List[str]): Tags for organizing experiments project_name (str): Name 
of the project this experiment belongs to has_run (bool): Whether the experiment has been executed - results (ExperimentResults): Results after running the experiment + has_evaluated (bool): Whether the evaluations have been performed + outputs (List[Dict]): Outputs after running the task + results (ExperimentResults): Results after running evaluations """ def __init__( @@ -322,10 +324,12 @@ def __init__( self.task = task self.dataset = dataset self.evaluators = evaluators - self.tags = [] + self.tags = tags self.project_name = project_name # Post-run attributes self.has_run = False + self.has_evaluated = False + self.outputs = [] self.results = None def __repr__(self) -> str: @@ -370,26 +374,23 @@ def _validate_tags(self) -> None: f"Invalid tag format: {tag}. Tags should be in the format 'key:value'." ) - def run(self, _jobs: int = 10) -> "ExperimentResults": - """Execute the experiment on the dataset. + def run(self, _jobs: int = 10) -> None: + """Execute the experiment tasks on the dataset without performing evaluations. - Runs the task function on each dataset record in parallel and collects - results and evaluations. + Runs the task function on each dataset record in parallel and stores + the outputs and metadata. Args: _jobs (int, optional): Number of parallel workers. Defaults to 10. Must be between 1 and 20. - Returns: - ExperimentResults: Object containing the experiment results - Raises: ValueError: If _jobs is not between 1 and 20 """ if not 1 <= _jobs <= 20: raise ValueError("Number of jobs must be between 1 and 20") - results = ExperimentResults(self.dataset, self) + self.outputs = [] total_rows = len(self.dataset) def process_row(idx_row): @@ -401,42 +402,117 @@ def process_row(idx_row): end_time = time.time() duration = end_time - start_time + return { + "idx": idx, + "output": output, + "metadata": { + "timestamp": start_time, + "duration": duration, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "error": None, + } + except Exception as e: + return { + "idx": idx, + "output": None, + "metadata": { + "timestamp": time.time(), + "duration": 0, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "error": str(e), + } + + with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: + future_to_idx = { + executor.submit(process_row, (idx, row)): idx + for idx, row in enumerate(self.dataset) + } + + # Process as they complete while maintaining order + completed = 0 + outputs_buffer = [None] * total_rows + for future in concurrent.futures.as_completed(future_to_idx): + idx = future_to_idx[future] + outputs_buffer[idx] = future.result() + completed += 1 + + # Update progress + progress = int(50 * completed / total_rows) + bar = f"{'=' * progress}{' ' * (50 - progress)}" + percent = int(100 * completed / total_rows) + sys.stdout.write( + f"\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})" + ) + sys.stdout.flush() + + self.outputs = outputs_buffer + + sys.stdout.write("\n") + + self.has_run = True + + return self + + def eval(self, _jobs: int = 10) -> "ExperimentResults": + """Evaluate the outputs using the provided evaluators. + + Runs the evaluators on each output in parallel and collects evaluations. + + Args: + _jobs (int, optional): Number of parallel workers. Defaults to 10. + Must be between 1 and 20. 
+ + Returns: + ExperimentResults: Object containing the experiment results + + Raises: + ValueError: If _jobs is not between 1 and 20 + ValueError: If the experiment has not been run yet + """ + if not 1 <= _jobs <= 20: + raise ValueError("Number of jobs must be between 1 and 20") + + if not self.has_run: + raise ValueError("Experiment has not been run yet. Please call run() before eval().") + + results = ExperimentResults(self.dataset, self) + total_rows = len(self.outputs) + + def evaluate_output(idx_output): + idx, output_data = idx_output + try: + idx_in_dataset = output_data["metadata"]["dataset_record_idx"] + row = self.dataset[idx_in_dataset] + output = output_data["output"] evaluations = { evaluator.__name__: evaluator(row, output) for evaluator in self.evaluators } - return { - "idx": idx, - "result": { - "output": output, - "evaluations": evaluations, - "metadata": { - "timestamp": start_time, - "duration": duration, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "tags": self.tags, - "error": None, - }, + result = { + "output": output, + "evaluations": evaluations, + "metadata": output_data["metadata"], + "tags": self.tags, + "error": output_data["error"], } + + return {"idx": idx, "result": result} except Exception as e: return { "idx": idx, "result": { - "output": None, + "output": output_data["output"], "evaluations": {}, - "metadata": { - "timestamp": time.time(), - "duration": 0, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, + "metadata": output_data["metadata"], "tags": self.tags, "error": str(e), }, @@ -444,8 +520,8 @@ def process_row(idx_row): with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: future_to_idx = { - executor.submit(process_row, (idx, row)): idx - for idx, row in enumerate(self.dataset) + executor.submit(evaluate_output, (idx, output_data)): idx + for idx, output_data in enumerate(self.outputs) } # Process as they complete while maintaining order @@ -461,23 +537,21 @@ def process_row(idx_row): bar = f"{'=' * progress}{' ' * (50 - progress)}" percent = int(100 * completed / total_rows) sys.stdout.write( - f"\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})" + f"\rEvaluating {self.name}: [{bar}] {percent}% ({completed}/{total_rows})" ) sys.stdout.flush() - # Add results in correct order results.experiment_rows = results_buffer - # Print a new line after completion sys.stdout.write("\n") - self.has_run = True + self.has_evaluated = True self.results = results return results - def get_results(self) -> Union["ExperimentResults", List["ExperimentResults"]]: - if not self.has_run: - raise ValueError("Experiment has not been run yet") + def get_results(self) -> 'ExperimentResults': + if not self.has_evaluated: + raise ValueError("Evaluations have not been performed yet. 
Please call eval() after run().") return self.results From 0e421da52b95f539661ec51c3f1c4e42b73ff064 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Mon, 4 Nov 2024 15:47:09 -0500 Subject: [PATCH 11/36] Change parametrize function to make it simpler --- ddtrace/llmobs/_experiments.py | 219 +++++++++++++++++---------------- 1 file changed, 112 insertions(+), 107 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 41a5cef3804..ed4f23ff363 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -216,14 +216,12 @@ def push(self) -> Dict[str, str]: if not datasets: # Create new dataset - print(f"Dataset '{self.name}' not found. Creating it.") dataset_payload = { "data": { "type": "datasets", "attributes": { "name": self.name, - "description": self.description - or f"Dataset used for {self.name}", + "description": self.description, "metadata": {"team": "ml-obs"}, }, } @@ -239,34 +237,11 @@ def push(self) -> Dict[str, str]: dataset_id = response_data["data"]["id"] self.datadog_dataset_id = dataset_id else: - # Dataset exists, create a new version - dataset_id = datasets[0]["id"] - version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - new_dataset_name = f"{self.name}-{version_suffix}" - print( - f"Dataset '{self.name}' found. Creating new version '{new_dataset_name}'." - ) - dataset_payload = { - "data": { - "type": "datasets", - "attributes": { - "name": new_dataset_name, - "description": f"Dataset versioned on {version_suffix} used for {self.name}", - "metadata": {"team": "ml-obs"}, - }, - } - } - response_data = _make_request( - conn, - headers, - "POST", - "/api/unstable/llm-obs/v1/datasets", - body=json.dumps(dataset_payload), - context="Dataset version creation", + # Dataset exists, raise error + raise ValueError( + f"Dataset '{self.name}' already exists. Dataset versioning will be supported in a future release. " + "Please use a different name for your dataset." ) - dataset_id = response_data["data"]["id"] - self.datadog_dataset_id = dataset_id - self.name = new_dataset_name # Add records to the dataset records_payload = { @@ -319,6 +294,8 @@ def __init__( evaluators: List[Callable], tags: List[str] = [], project_name: str = "-", + description: str = "", + metadata: Dict[str, Any] = {}, ) -> None: self.name = name self.task = task @@ -326,6 +303,8 @@ def __init__( self.evaluators = evaluators self.tags = tags self.project_name = project_name + self.description = description + self.metadata = metadata # Post-run attributes self.has_run = False self.has_evaluated = False @@ -459,7 +438,7 @@ def process_row(idx_row): self.has_run = True - return self + return self.eval() def eval(self, _jobs: int = 10) -> "ExperimentResults": """Evaluate the outputs using the provided evaluators. @@ -532,15 +511,6 @@ def evaluate_output(idx_output): results_buffer[idx] = future.result()["result"] completed += 1 - # Update progress - progress = int(50 * completed / total_rows) - bar = f"{'=' * progress}{' ' * (50 - progress)}" - percent = int(100 * completed / total_rows) - sys.stdout.write( - f"\rEvaluating {self.name}: [{bar}] {percent}% ({completed}/{total_rows})" - ) - sys.stdout.flush() - results.experiment_rows = results_buffer sys.stdout.write("\n") @@ -675,15 +645,12 @@ def push(self) -> Dict[str, str]: if not projects: # Create new project - print( - f"Project '{self.experiment.project_name}' not found. Creating it." 
- ) project_payload = { "data": { "type": "projects", "attributes": { "name": self.experiment.project_name, - "description": f"Project for {self.experiment.project_name}", + "description": "", "metadata": {"team": "ml-obs"}, }, } @@ -710,18 +677,17 @@ def push(self) -> Dict[str, str]: if not experiments: # Create new experiment - print(f"Experiment '{self.experiment.name}' not found. Creating it.") experiment_payload = { "data": { "type": "experiments", "attributes": { "name": self.experiment.name, - "description": f"Experiment: {self.experiment.name} on dataset: {self.experiment.dataset.name}", + "description": self.experiment.description, "dataset_id": self.experiment.dataset.datadog_dataset_id, "project_id": project_id, "metadata": { "tags": self.experiment.tags, - "team": "ml-obs", + **self.experiment.metadata, }, }, } @@ -747,12 +713,12 @@ def push(self) -> Dict[str, str]: "type": "experiments", "attributes": { "name": new_experiment_name, - "description": f"Experiment versioned on {version_suffix} used for {self.experiment.name}", + "description": self.experiment.description, "dataset_id": self.experiment.dataset.datadog_dataset_id, "project_id": project_id, "metadata": { "tags": self.experiment.tags, - "team": "ml-obs", + **self.experiment.metadata, }, }, } @@ -783,6 +749,8 @@ def push(self) -> Dict[str, str]: "duration": float(result["metadata"]["duration"] * 1e9), "tags": self.experiment.tags, "status": "ok", + "metrics": { # TODO: Fill in with actual metrics once we have tracing and llm spans + }, "meta": { "span": {"kind": "experiment"}, "input": self.experiment.dataset[idx]["input"], @@ -802,10 +770,11 @@ def push(self) -> Dict[str, str]: # Add evaluation metrics for metric_name, metric_value in result["evaluations"].items(): timestamp_ms = int(result["metadata"]["timestamp"] * 1000) - + + # Check for bool first, since bool is a subclass of int if isinstance(metric_value, bool): - metric_value = 1 if metric_value else 0 - metric_type = "score" + metric_type = "categorical" + metric_value = str(metric_value).lower() elif isinstance(metric_value, (int, float)): metric_type = "score" else: @@ -817,12 +786,16 @@ def push(self) -> Dict[str, str]: "metric_type": metric_type, "timestamp_ms": timestamp_ms, "label": metric_name, - "score_value" - if metric_type == "score" - else "categorical_value": metric_value, } + + if metric_type == "score": + metric["score_value"] = metric_value + else: + metric["categorical_value"] = metric_value + metrics.append(metric) + print(metrics) results_payload = { "data": { "type": "experiments", @@ -830,6 +803,8 @@ def push(self) -> Dict[str, str]: } } + + url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" _make_request( conn, @@ -853,57 +828,6 @@ def push(self) -> Dict[str, str]: conn.close() -def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: - """Decorator that creates multiple versions of a function with different parameter combinations. - - Creates multiple versions of a function by generating all possible combinations - of the provided parameters. Each generated function variant includes tags - indicating its parameter values. - - Args: - **param_dict: Dictionary of parameter names and their possible values. - Values can be single items or lists of possible values. 
- - Returns: - Callable: Decorator function that generates parameterized versions of the input function - - Example: - @parametrize(model=["gpt-3", "gpt-4"], temperature=[0.0, 0.7]) - def my_function(text, model, temperature): - # This will create 4 versions of the function with different combinations - # of model and temperature parameters - pass - """ - - def decorator(func): - # Generate all combinations of parameters - param_names = list(param_dict.keys()) - param_values = [ - param_dict[name] - if isinstance(param_dict[name], (list, tuple)) - else [param_dict[name]] - for name in param_names - ] - param_combinations = [ - dict(zip(param_names, combo)) for combo in itertools.product(*param_values) - ] - - # Create a new function for each parameter combination - def create_parameterized_func(params): - def wrapped_func(*args, **kwargs): - return func(*args, **{**kwargs, **params}) - - param_str = "-".join(f"{k}={v}" for k, v in params.items()) - wrapped_func.__name__ = f"{func.__name__}_{param_str}" - wrapped_func.tags = [f"{k}:{v}" for k, v in params.items()] - return wrapped_func - - return [create_parameterized_func(combo) for combo in param_combinations] - - return decorator - - - def _make_request( conn: HTTPSConnection, headers: Dict[str, Any], @@ -930,7 +854,7 @@ def _make_request( response_text = response_body.decode('utf-8') if response.status >= 400: - error_message = f"HTTP {response.status} Error during {context}: {response.reason}" + error_message = f"HTTP {response.status} Error during {context}: {response.reason}\nResponse: {response_text}" raise DatadogAPIError(error_message, status_code=response.status, response=response_text) if not response_body: @@ -984,3 +908,84 @@ def _validate_api_keys() -> None: ) + +def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: + """Decorator that creates multiple versions by combining all parameter values. + + Args: + **param_dict: Dictionary of parameter names and their possible values. + Values can be single items or lists of possible values. + + Returns: + List[Any]: List of results from calling the decorated function with each parameter combination + """ + def decorator(func): + # Convert single values to lists + processed_params = { + name: [val] if not isinstance(val, (list, tuple)) else val + for name, val in param_dict.items() + } + + # Generate all combinations of parameters + param_names = list(processed_params.keys()) + param_values = [processed_params[name] for name in param_names] + param_combinations = [ + dict(zip(param_names, combo)) + for combo in itertools.product(*param_values) + ] + + # Return list of results from calling function with each combination + return [func(**params) for params in param_combinations] + + return decorator + +class Prompt: + """A class for rendering templated prompts with variables. + + Supports both simple string templates and structured chat-like templates. + + Attributes: + template (Union[str, List[Dict[str, str]]]): Either a template string or a list of message dictionaries + variables (dict): Default variables to use when rendering the template + """ + + def __init__(self, template, variables=None): + """Initialize a new Prompt. + + Args: + template (Union[str, List[Dict[str, str]]]): Either a template string or a list of message dictionaries + variables (dict, optional): Default variables to use when rendering the template. Defaults to {}. 
+ """ + self.template = template + self.variables = variables or {} + + def render(self, **kwargs): + """Render the template with provided variables. + + Args: + **kwargs: Additional variables to use when rendering the template. + These override any default variables with the same name. + + Returns: + Union[str, List[Dict[str, str]]]: The rendered template with all variables substituted + """ + merged_vars = {**self.variables, **kwargs} + + if isinstance(self.template, str): + return self.template.format(**merged_vars) + elif isinstance(self.template, (list, tuple)): + return [ + { + k: v.format(**merged_vars) if isinstance(v, str) else v + for k, v in message.items() + } + for message in self.template + ] + else: + raise ValueError("Template must be either a string or a list of message dictionaries") + + + + def __repr__(self): + hash = hashlib.md5(str(self.template).encode()).hexdigest()[:8] + return f"Prompt(hash={hash})" \ No newline at end of file From 173d2aea055193c7e061223ad3a6735005e4b0a6 Mon Sep 17 00:00:00 2001 From: Kyle Verhoog Date: Tue, 5 Nov 2024 11:20:17 -0500 Subject: [PATCH 12/36] Add test file, export the top level classes --- ddtrace/llmobs/__init__.py | 5 +- ddtrace/llmobs/_experiments.py | 164 +++++++----------------- tests/llmobs/test_llmobs_experiments.py | 51 ++++++++ 3 files changed, 103 insertions(+), 117 deletions(-) create mode 100644 tests/llmobs/test_llmobs_experiments.py diff --git a/ddtrace/llmobs/__init__.py b/ddtrace/llmobs/__init__.py index 11100d3ed66..72596f2418e 100644 --- a/ddtrace/llmobs/__init__.py +++ b/ddtrace/llmobs/__init__.py @@ -7,6 +7,9 @@ """ from ._llmobs import LLMObs +from ._experiments import Dataset +from ._experiments import Experiment +from ._experiments import ExperimentResults -__all__ = ["LLMObs"] +__all__ = ["LLMObs", "Dataset", "Experiment", "ExperimentResults"] diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index ed4f23ff363..156fab1d8cc 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -8,9 +8,9 @@ import time from urllib.parse import quote import concurrent.futures -import itertools import uuid + # Constants BASE_URL = "api.datadoghq.com" @@ -28,25 +28,23 @@ class Dataset: datadog_dataset_id (str): ID assigned by Datadog after pushing (None if not pushed) """ - def __init__( - self, name: str, data: List[Dict[str, Any]], description: str = "" - ) -> None: + def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") -> None: self.name = name self._validate_data(data) - self.data = data + self._data = data self.description = description # Post-push attributes self.datadog_dataset_id = None def __iter__(self) -> Iterator[Dict[str, Any]]: - return iter(self.data) + return iter(self._data) def __len__(self) -> int: - return len(self.data) + return len(self._data) def __getitem__(self, index: int) -> Dict[str, Any]: - return self.data[index] + return self._data[index] def __repr__(self) -> str: header = f"Dataset: {self.name}\nDescription: {self.description}\nLength: {len(self)}\nDatadog ID: {self.datadog_dataset_id}\n" @@ -80,13 +78,11 @@ def format_entries(entries): formatted_rows.append(separator) return "\n".join(formatted_rows) - if len(self.data) <= 4: - entries = format_entries(enumerate(self.data)) + if len(self._data) <= 4: + entries = format_entries(enumerate(self._data)) else: - first_two = format_entries(enumerate(self.data[:2])) - last_two = format_entries( - enumerate(self.data[-2:], start=len(self.data) - 2) - ) + first_two = 
format_entries(enumerate(self._data[:2])) + last_two = format_entries(enumerate(self._data[-2:], start=len(self._data) - 2)) entries = f"{first_two}\n| {'...':<8} | {'...':<38} | {'...':<38} |\n{separator}\n{last_two}" table = f"{separator}\n| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} |\n{separator}\n{entries}" @@ -119,12 +115,8 @@ def _validate_data(self, data: List[Dict[str, Any]]) -> None: # Check that 'input' and 'expected_output' are flat dictionaries for key in ["input", "expected_output"]: - if key in row and any( - isinstance(value, dict) for value in row[key].values() - ): - raise ValueError( - f"'{key}' must be a flat dictionary (no nested dictionaries)." - ) + if key in row and any(isinstance(value, dict) for value in row[key].values()): + raise ValueError(f"'{key}' must be a flat dictionary (no nested dictionaries).") @classmethod def from_datadog(cls, name: str) -> "Dataset": @@ -152,9 +144,7 @@ def from_datadog(cls, name: str) -> "Dataset": # Get dataset ID encoded_name = quote(name) url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" - response_data = _make_request( - conn, headers, "GET", url, context="Dataset lookup" - ) + response_data = _make_request(conn, headers, "GET", url, context="Dataset lookup") datasets = response_data.get("data", []) if not datasets: @@ -164,9 +154,7 @@ def from_datadog(cls, name: str) -> "Dataset": # Get dataset records url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" - records_data = _make_request( - conn, headers, "GET", url, context="Records lookup" - ) + records_data = _make_request(conn, headers, "GET", url, context="Records lookup") # Transform records into the expected format class_records = [] @@ -209,9 +197,7 @@ def push(self) -> Dict[str, str]: # Check if dataset exists encoded_name = quote(self.name) url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" - response_data = _make_request( - conn, headers, "GET", url, context="Dataset lookup" - ) + response_data = _make_request(conn, headers, "GET", url, context="Dataset lookup") datasets = response_data.get("data", []) if not datasets: @@ -244,9 +230,7 @@ def push(self) -> Dict[str, str]: ) # Add records to the dataset - records_payload = { - "data": {"type": "datasets", "attributes": {"records": self.data}} - } + records_payload = {"data": {"type": "datasets", "attributes": {"records": self._data}}} url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" _make_request( conn, @@ -259,7 +243,7 @@ def push(self) -> Dict[str, str]: print(f"✓ Successfully uploaded dataset '{self.name}'") print(f" • Dataset ID: {dataset_id}") - print(f" • Records uploaded: {len(self.data)}") + print(f" • Records uploaded: {len(self._data)}") return self @@ -318,9 +302,7 @@ def format_evaluator(evaluator: Callable) -> str: return f"{evaluator.__name__}" evaluator_lines = [format_evaluator(evaluator) for evaluator in self.evaluators] - evaluators = ( - ", ".join(evaluator_lines) if evaluator_lines else "No evaluators available" - ) + evaluators = ", ".join(evaluator_lines) if evaluator_lines else "No evaluators available" table = ( f"{separator}\n" @@ -349,9 +331,7 @@ def _validate_tags(self) -> None: """ for tag in self.tags: if not isinstance(tag, str) or ":" not in tag: - raise ValueError( - f"Invalid tag format: {tag}. Tags should be in the format 'key:value'." - ) + raise ValueError(f"Invalid tag format: {tag}. 
Tags should be in the format 'key:value'.") def run(self, _jobs: int = 10) -> None: """Execute the experiment tasks on the dataset without performing evaluations. @@ -410,10 +390,7 @@ def process_row(idx_row): } with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: - future_to_idx = { - executor.submit(process_row, (idx, row)): idx - for idx, row in enumerate(self.dataset) - } + future_to_idx = {executor.submit(process_row, (idx, row)): idx for idx, row in enumerate(self.dataset)} # Process as they complete while maintaining order completed = 0 @@ -427,9 +404,7 @@ def process_row(idx_row): progress = int(50 * completed / total_rows) bar = f"{'=' * progress}{' ' * (50 - progress)}" percent = int(100 * completed / total_rows) - sys.stdout.write( - f"\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})" - ) + sys.stdout.write(f"\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})") sys.stdout.flush() self.outputs = outputs_buffer @@ -471,10 +446,7 @@ def evaluate_output(idx_output): idx_in_dataset = output_data["metadata"]["dataset_record_idx"] row = self.dataset[idx_in_dataset] output = output_data["output"] - evaluations = { - evaluator.__name__: evaluator(row, output) - for evaluator in self.evaluators - } + evaluations = {evaluator.__name__: evaluator(row, output) for evaluator in self.evaluators} result = { "output": output, @@ -511,6 +483,13 @@ def evaluate_output(idx_output): results_buffer[idx] = future.result()["result"] completed += 1 + # Update progress + progress = int(50 * completed / total_rows) + bar = f"{'=' * progress}{' ' * (50 - progress)}" + percent = int(100 * completed / total_rows) + sys.stdout.write(f"\rEvaluating {self.name}: [{bar}] {percent}% ({completed}/{total_rows})") + sys.stdout.flush() + results.experiment_rows = results_buffer sys.stdout.write("\n") @@ -519,7 +498,7 @@ def evaluate_output(idx_output): self.results = results return results - def get_results(self) -> 'ExperimentResults': + def get_results(self) -> "ExperimentResults": if not self.has_evaluated: raise ValueError("Evaluations have not been performed yet. Please call eval() after run().") return self.results @@ -563,9 +542,7 @@ def format_entries(entries): dataset_idx = entry["metadata"]["dataset_record_idx"] dataset_entry = self.dataset[dataset_idx] input_lines = format_dict(dataset_entry["input"]) - expected_output_lines = format_dict( - dataset_entry.get("expected_output", {}) - ) + expected_output_lines = format_dict(dataset_entry.get("expected_output", {})) output_lines = format_dict(entry["output"]) evaluations_lines = format_dict(entry.get("evaluations", [])) @@ -638,13 +615,12 @@ def push(self) -> Dict[str, str]: try: # Check if project exists url = f"/api/unstable/llm-obs/v1/projects?filter[name]={self.experiment.project_name}" - response_data = _make_request( - conn, headers, "GET", url, context="Project lookup" - ) + response_data = _make_request(conn, headers, "GET", url, context="Project lookup") projects = response_data.get("data", []) if not projects: # Create new project + print(f"Project '{self.experiment.project_name}' not found. 
Creating it.") project_payload = { "data": { "type": "projects", @@ -670,9 +646,7 @@ def push(self) -> Dict[str, str]: # Check if experiment exists encoded_name = quote(self.experiment.name) url = f"/api/unstable/llm-obs/v1/experiments?filter[name]={encoded_name}" - response_data = _make_request( - conn, headers, "GET", url, context="Experiment lookup" - ) + response_data = _make_request(conn, headers, "GET", url, context="Experiment lookup") experiments = response_data.get("data", []) if not experiments: @@ -705,9 +679,7 @@ def push(self) -> Dict[str, str]: # Experiment exists, create a new version version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") new_experiment_name = f"{self.experiment.name}-{version_suffix}" - print( - f"Experiment '{self.experiment.name}' found. Creating new version '{new_experiment_name}'." - ) + print(f"Experiment '{self.experiment.name}' found. Creating new version '{new_experiment_name}'.") experiment_payload = { "data": { "type": "experiments", @@ -738,7 +710,6 @@ def push(self) -> Dict[str, str]: metrics = [] for idx, result in enumerate(self.experiment_rows): - span = { "span_id": _make_id(), "project_id": project_id, @@ -749,15 +720,12 @@ def push(self) -> Dict[str, str]: "duration": float(result["metadata"]["duration"] * 1e9), "tags": self.experiment.tags, "status": "ok", - "metrics": { # TODO: Fill in with actual metrics once we have tracing and llm spans - }, + "metrics": {}, # TODO: Fill in with actual metrics once we have tracing and llm spans "meta": { "span": {"kind": "experiment"}, "input": self.experiment.dataset[idx]["input"], "output": result["output"], - "expected_output": self.experiment.dataset[idx].get( - "expected_output", {} - ), + "expected_output": self.experiment.dataset[idx].get("expected_output", {}), "error": { "message": result["error"], "stack": None, @@ -770,11 +738,11 @@ def push(self) -> Dict[str, str]: # Add evaluation metrics for metric_name, metric_value in result["evaluations"].items(): timestamp_ms = int(result["metadata"]["timestamp"] * 1000) - + # Check for bool first, since bool is a subclass of int if isinstance(metric_value, bool): metric_type = "categorical" - metric_value = str(metric_value).lower() + metric_value = str(metric_value).lower() elif isinstance(metric_value, (int, float)): metric_type = "score" else: @@ -786,6 +754,7 @@ def push(self) -> Dict[str, str]: "metric_type": metric_type, "timestamp_ms": timestamp_ms, "label": metric_name, + "score_value" if metric_type == "score" else "categorical_value": metric_value, } if metric_type == "score": @@ -803,8 +772,6 @@ def push(self) -> Dict[str, str]: } } - - url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" _make_request( conn, @@ -815,9 +782,7 @@ def push(self) -> Dict[str, str]: context="Publishing results", ) - print( - f"✓ Successfully uploaded experiment results for '{self.experiment.name}'" - ) + print(f"✓ Successfully uploaded experiment results for '{self.experiment.name}'") print(f" • Experiment ID: {experiment_id}") print(f" • Spans uploaded: {len(spans)}") print(f" • Metrics uploaded: {len(metrics)}") @@ -851,7 +816,7 @@ def _make_request( response = conn.getresponse() response_body = response.read() - response_text = response_body.decode('utf-8') + response_text = response_body.decode("utf-8") if response.status >= 400: error_message = f"HTTP {response.status} Error during {context}: {response.reason}\nResponse: {response_text}" @@ -878,13 +843,16 @@ def _make_id() -> str: class DatadogAPIError(Exception): """Raised when 
there is an error interacting with the Datadog API.""" + def __init__(self, message: str, status_code: Optional[int] = None, response: Optional[str] = None): self.status_code = status_code self.response = response super().__init__(message) + class DatadogResponseError(Exception): """Raised when there is an error parsing the response from Datadog.""" + def __init__(self, message: str, raw_response: Optional[str] = None): self.raw_response = raw_response super().__init__(message) @@ -908,37 +876,6 @@ def _validate_api_keys() -> None: ) - -def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: - """Decorator that creates multiple versions by combining all parameter values. - - Args: - **param_dict: Dictionary of parameter names and their possible values. - Values can be single items or lists of possible values. - - Returns: - List[Any]: List of results from calling the decorated function with each parameter combination - """ - def decorator(func): - # Convert single values to lists - processed_params = { - name: [val] if not isinstance(val, (list, tuple)) else val - for name, val in param_dict.items() - } - - # Generate all combinations of parameters - param_names = list(processed_params.keys()) - param_values = [processed_params[name] for name in param_names] - param_combinations = [ - dict(zip(param_names, combo)) - for combo in itertools.product(*param_values) - ] - - # Return list of results from calling function with each combination - return [func(**params) for params in param_combinations] - - return decorator - class Prompt: """A class for rendering templated prompts with variables. @@ -975,17 +912,12 @@ def render(self, **kwargs): return self.template.format(**merged_vars) elif isinstance(self.template, (list, tuple)): return [ - { - k: v.format(**merged_vars) if isinstance(v, str) else v - for k, v in message.items() - } + {k: v.format(**merged_vars) if isinstance(v, str) else v for k, v in message.items()} for message in self.template ] else: raise ValueError("Template must be either a string or a list of message dictionaries") - - - + def __repr__(self): hash = hashlib.md5(str(self.template).encode()).hexdigest()[:8] - return f"Prompt(hash={hash})" \ No newline at end of file + return f"Prompt(hash={hash})" diff --git a/tests/llmobs/test_llmobs_experiments.py b/tests/llmobs/test_llmobs_experiments.py new file mode 100644 index 00000000000..f01551afc17 --- /dev/null +++ b/tests/llmobs/test_llmobs_experiments.py @@ -0,0 +1,51 @@ +import itertools +from typing import Any +from typing import Callable +from typing import Dict +from typing import List +from typing import Union + +from ddtrace.llmobs import Dataset + + +def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: + """Decorator that creates multiple versions by combining all parameter values. + + Args: + **param_dict: Dictionary of parameter names and their possible values. + Values can be single items or lists of possible values. 
+ Returns: + List[Any]: List of results from calling the decorated function with each parameter combination + """ + + def decorator(func): + # Convert single values to lists + processed_params = { + name: [val] if not isinstance(val, (list, tuple)) else val for name, val in param_dict.items() + } + + # Generate all combinations of parameters + param_names = list(processed_params.keys()) + param_values = [processed_params[name] for name in param_names] + param_combinations = [dict(zip(param_names, combo)) for combo in itertools.product(*param_values)] + + # Return list of results from calling function with each combination + return [func(**params) for params in param_combinations] + + return decorator + + +def test_create_dataset(): + dataset = Dataset( + name="geography-dataset", + data=[ + {"input": {"prompt": "capital of France?"}, "expected_output": {"response": "Paris"}}, + {"input": {"prompt": "capital of Germany?"}, "expected_output": {"response": "Berlin"}}, + {"input": {"prompt": "capital of Japan?"}, "expected_output": {"response": "Tokyo"}}, + {"input": {"prompt": "capital of Canada?"}, "expected_output": {"response": "Ottawa"}}, + # ... more data entries ... + ], + ) + + assert dataset.name == "geography-dataset" + assert dataset[0] == {"input": {"prompt": "capital of France?"}, "expected_output": {"response": "Paris"}} From 044d696b63a3d0adaaa01c1a73254b0dd3878142 Mon Sep 17 00:00:00 2001 From: Kyle Verhoog Date: Tue, 5 Nov 2024 11:36:41 -0500 Subject: [PATCH 13/36] fmt --- ddtrace/llmobs/__init__.py | 2 +- ddtrace/llmobs/_experiments.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/ddtrace/llmobs/__init__.py b/ddtrace/llmobs/__init__.py index 72596f2418e..73429c6713d 100644 --- a/ddtrace/llmobs/__init__.py +++ b/ddtrace/llmobs/__init__.py @@ -6,10 +6,10 @@ LLMObs.enable() """ -from ._llmobs import LLMObs from ._experiments import Dataset from ._experiments import Experiment from ._experiments import ExperimentResults +from ._llmobs import LLMObs __all__ = ["LLMObs", "Dataset", "Experiment", "ExperimentResults"] diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 156fab1d8cc..4b4a3f4cdb1 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,13 +1,19 @@ +import concurrent.futures from datetime import datetime -from http.client import HTTPSConnection import hashlib +from http.client import HTTPSConnection import json import os -from typing import Any, Callable, Dict, List, Union, Optional, Iterator import sys import time +from typing import Any +from typing import Callable +from typing import Dict +from typing import Iterator +from typing import List +from typing import Optional +from typing import Union from urllib.parse import quote -import concurrent.futures import uuid From ac634fa333dfe4da51846db9560593326653af52 Mon Sep 17 00:00:00 2001 From: Kyle Verhoog Date: Tue, 5 Nov 2024 17:05:46 -0500 Subject: [PATCH 14/36] Simplify http client, remove stdout printing --- ddtrace/llmobs/_experiments.py | 749 +++++------------- ddtrace/llmobs/_utils.py | 31 + .../appsec/iast/fixtures/propagation_path.py | 3 +- .../experiments/test_dataset_pull.yaml | 136 ++++ .../experiments/test_dataset_pull_dne.yaml | 38 + tests/llmobs/test_llmobs_experiments.py | 40 +- tests/llmobs/test_utils.py | 10 + 7 files changed, 472 insertions(+), 535 deletions(-) create mode 100644 tests/llmobs/cassettes/experiments/test_dataset_pull.yaml create mode 100644 
tests/llmobs/cassettes/experiments/test_dataset_pull_dne.yaml diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 4b4a3f4cdb1..f1998f1a8cb 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,7 +1,5 @@ import concurrent.futures from datetime import datetime -import hashlib -from http.client import HTTPSConnection import json import os import sys @@ -12,13 +10,14 @@ from typing import Iterator from typing import List from typing import Optional -from typing import Union from urllib.parse import quote import uuid +from ._utils import HTTPResponse +from ._utils import http_request -# Constants -BASE_URL = "api.datadoghq.com" + +BASE_URL = "https://api.datadoghq.com" class Dataset: @@ -29,19 +28,17 @@ class Dataset: Attributes: name (str): Name of the dataset - data (List[Dict[str, Any]]): List of records containing input/output pairs description (str): Optional description of the dataset - datadog_dataset_id (str): ID assigned by Datadog after pushing (None if not pushed) """ def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") -> None: self.name = name + self.description = description self._validate_data(data) self._data = data - self.description = description # Post-push attributes - self.datadog_dataset_id = None + self._datadog_dataset_id = None def __iter__(self) -> Iterator[Dict[str, Any]]: return iter(self._data) @@ -52,48 +49,6 @@ def __len__(self) -> int: def __getitem__(self, index: int) -> Dict[str, Any]: return self._data[index] - def __repr__(self) -> str: - header = f"Dataset: {self.name}\nDescription: {self.description}\nLength: {len(self)}\nDatadog ID: {self.datadog_dataset_id}\n" - separator = f"+{'-' * 10}+{'-' * 38}+{'-' * 38}+" - - def format_dict(d: Dict[str, Any]) -> List[str]: - def truncate(value: str) -> str: - return (value[:17] + "...") if len(value) > 20 else value - - return [f"{key}: {truncate(str(value))}" for key, value in d.items()] - - def format_entries(entries): - formatted_rows = [] - for i, entry in entries: - input_lines = format_dict(entry["input"]) - expected_output_lines = format_dict(entry.get("expected_output", {})) - - # Determine the maximum number of lines in input and expected_output - max_lines = max(len(input_lines), len(expected_output_lines)) - - # Pad the lists to have the same number of lines - input_lines += [""] * (max_lines - len(input_lines)) - expected_output_lines += [""] * (max_lines - len(expected_output_lines)) - - for j in range(max_lines): - if j == 0: - index = f"| {i+1:<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} |" - else: - index = f"| {'':<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} |" - formatted_rows.append(index) - formatted_rows.append(separator) - return "\n".join(formatted_rows) - - if len(self._data) <= 4: - entries = format_entries(enumerate(self._data)) - else: - first_two = format_entries(enumerate(self._data[:2])) - last_two = format_entries(enumerate(self._data[-2:], start=len(self._data) - 2)) - entries = f"{first_two}\n| {'...':<8} | {'...':<38} | {'...':<38} |\n{separator}\n{last_two}" - - table = f"{separator}\n| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} |\n{separator}\n{entries}" - return f"{header}\n{table if entries else 'No entries available.'}\n\n" - def _validate_data(self, data: List[Dict[str, Any]]) -> None: """Validate the format and structure of dataset records. 
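For orientation, a minimal sketch of how the slimmed-down Dataset container is exercised after this hunk — construction, len(), indexing, and iteration all operate on the plain record dicts. The rows below are illustrative placeholders, not data taken from this patch:

    from ddtrace.llmobs import Dataset

    # Placeholder rows: flat "input"/"expected_output" dicts, mirroring the tests further down.
    records = [
        {"input": {"prompt": "capital of Italy?"}, "expected_output": {"response": "Rome"}},
        {"input": {"prompt": "capital of Spain?"}, "expected_output": {"response": "Madrid"}},
    ]

    dataset = Dataset(name="capitals-demo", data=records, description="toy dataset")

    assert len(dataset) == 2                                        # __len__
    assert dataset[1]["expected_output"] == {"response": "Madrid"}  # __getitem__
    for record in dataset:                                          # __iter__ yields the raw dicts
        print(record["input"]["prompt"])
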
@@ -138,49 +93,39 @@ def from_datadog(cls, name: str) -> "Dataset": ValueError: If the dataset is not found Exception: If there are HTTP errors during the request """ - _validate_api_keys() - conn = HTTPSConnection(BASE_URL) - headers = { - "DD-API-KEY": os.getenv("DD_API_KEY"), - "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), - "Content-Type": "application/json", - } + # Get dataset ID + encoded_name = quote(name) + url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" + resp = exp_http_request("GET", url) + response_data = resp.json() + datasets = response_data.get("data", []) + + if not datasets: + raise ValueError(f"Dataset '{name}' not found") + + dataset_id = datasets[0]["id"] + + # Get dataset records + url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" + resp = exp_http_request("GET", url) + records_data = resp.json() + + # Transform records into the expected format + class_records = [] + for record in records_data.get("data", []): + attrs = record.get("attributes", {}) + class_records.append( + { + "input": attrs.get("input", {}), + "expected_output": attrs.get("expected_output", {}), + **attrs.get("metadata", {}), + } + ) - try: - # Get dataset ID - encoded_name = quote(name) - url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" - response_data = _make_request(conn, headers, "GET", url, context="Dataset lookup") - datasets = response_data.get("data", []) - - if not datasets: - raise ValueError(f"Dataset '{name}' not found") - - dataset_id = datasets[0]["id"] - - # Get dataset records - url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" - records_data = _make_request(conn, headers, "GET", url, context="Records lookup") - - # Transform records into the expected format - class_records = [] - for record in records_data.get("data", []): - attrs = record.get("attributes", {}) - class_records.append( - { - "input": attrs.get("input", {}), - "expected_output": attrs.get("expected_output", {}), - **attrs.get("metadata", {}), - } - ) - - # Create new dataset instance - dataset = cls(name, class_records) - dataset.datadog_dataset_id = dataset_id - return dataset - - finally: - conn.close() + # Create new dataset instance + dataset = cls(name, class_records) + dataset._datadog_dataset_id = dataset_id + return dataset def push(self) -> Dict[str, str]: """Push the dataset to Datadog. 
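A hedged sketch of the pull path rewritten in the hunk above, assuming DD_API_KEY and DD_APPLICATION_KEY are already exported (exp_http_request() refuses to run without them). The dataset name mirrors the cassette used by test_dataset_pull further down; substitute any dataset that exists in your org:

    from ddtrace.llmobs import Dataset

    try:
        # from_datadog() looks the dataset up by name and raises ValueError when no match exists.
        dataset = Dataset.from_datadog("meal-calorie-dataset-multilingual-3")
    except ValueError:
        dataset = None  # behaves like the test_dataset_pull_dne case

    if dataset is not None:
        # Records come back flattened as {"input": ..., "expected_output": ..., **metadata}.
        print(len(dataset), dataset[0]["input"], dataset[0]["expected_output"])
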
@@ -191,70 +136,44 @@ def push(self) -> Dict[str, str]: - dataset_name: The name of the dataset - record_count: Number of records uploaded """ - _validate_api_keys() - conn = HTTPSConnection(BASE_URL) - headers = { - "DD-API-KEY": os.getenv("DD_API_KEY"), - "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), - "Content-Type": "application/json", - } - - try: - # Check if dataset exists - encoded_name = quote(self.name) - url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" - response_data = _make_request(conn, headers, "GET", url, context="Dataset lookup") - datasets = response_data.get("data", []) - - if not datasets: - # Create new dataset - dataset_payload = { - "data": { - "type": "datasets", - "attributes": { - "name": self.name, - "description": self.description, - "metadata": {"team": "ml-obs"}, - }, - } + # Check if dataset exists + encoded_name = quote(self.name) + url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" + resp = exp_http_request("GET", url) + response_data = resp.json() + datasets = response_data.get("data", []) + + if not datasets: + # Create new dataset + dataset_payload = { + "data": { + "type": "datasets", + "attributes": { + "name": self.name, + "description": self.description, + "metadata": {"team": "ml-obs"}, + }, } - response_data = _make_request( - conn, - headers, - "POST", - "/api/unstable/llm-obs/v1/datasets", - body=json.dumps(dataset_payload), - context="Dataset creation", - ) - dataset_id = response_data["data"]["id"] - self.datadog_dataset_id = dataset_id - else: - # Dataset exists, raise error - raise ValueError( - f"Dataset '{self.name}' already exists. Dataset versioning will be supported in a future release. " - "Please use a different name for your dataset." - ) - - # Add records to the dataset - records_payload = {"data": {"type": "datasets", "attributes": {"records": self._data}}} - url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" - _make_request( - conn, - headers, - "POST", - url, - body=json.dumps(records_payload), - context="Adding records", + } + resp = exp_http_request( + "POST", "/api/unstable/llm-obs/v1/datasets", body=json.dumps(dataset_payload).encode("utf-8") + ) + response_data = resp.json() + dataset_id = response_data["data"]["id"] + self._datadog_dataset_id = dataset_id + else: + # Dataset exists, raise error + raise ValueError( + f"Dataset '{self.name}' already exists. Dataset versioning will be supported in a future release. " + "Please use a different name for your dataset." 
) - print(f"✓ Successfully uploaded dataset '{self.name}'") - print(f" • Dataset ID: {dataset_id}") - print(f" • Records uploaded: {len(self._data)}") - - return self - - finally: - conn.close() + # Add records to the dataset + records_payload = {"data": {"type": "datasets", "attributes": {"records": self._data}}} + url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" + resp = exp_http_request("POST", url, body=json.dumps(records_payload).encode("utf-8")) + data = resp.json() + return data class Experiment: @@ -301,26 +220,6 @@ def __init__( self.outputs = [] self.results = None - def __repr__(self) -> str: - separator = f"+{'-' * 20}+{'-' * 50}+" - - def format_evaluator(evaluator: Callable) -> str: - return f"{evaluator.__name__}" - - evaluator_lines = [format_evaluator(evaluator) for evaluator in self.evaluators] - evaluators = ", ".join(evaluator_lines) if evaluator_lines else "No evaluators available" - - table = ( - f"{separator}\n" - f"| {'Experiment':<18} | {self.name:<48} |\n" - f"{separator}\n" - f"| {'Task':<18} | {self.task.__name__:<48} |\n" - f"| {'Dataset':<18} | {f'{self.dataset.name} (n={len(self.dataset)})':<48} |\n" - f"| {'Evaluators':<18} | {evaluators:<48} |\n" - f"{separator}" - ) - return table - def _validate_tasks(self) -> None: # TODO: Design and implement this pass @@ -405,18 +304,8 @@ def process_row(idx_row): idx = future_to_idx[future] outputs_buffer[idx] = future.result() completed += 1 - - # Update progress - progress = int(50 * completed / total_rows) - bar = f"{'=' * progress}{' ' * (50 - progress)}" - percent = int(100 * completed / total_rows) - sys.stdout.write(f"\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})") - sys.stdout.flush() - self.outputs = outputs_buffer - sys.stdout.write("\n") - self.has_run = True return self.eval() @@ -527,69 +416,6 @@ def __init__(self, dataset: Dataset, experiment: Experiment) -> None: self.experiment = experiment self.experiment_rows = [] - def __repr__(self) -> str: - separator = f"+{'-' * 10}+{'-' * 38}+{'-' * 38}+{'-' * 38}+{'-' * 38}+" - - def format_dict(d: Union[Dict[str, Any], List[Any]]) -> List[str]: - if isinstance(d, dict): - - def truncate(value: str) -> str: - return (value[:17] + "...") if len(value) > 20 else value - - return [f"{key}: {truncate(str(value))}" for key, value in d.items()] - elif isinstance(d, list): - return [str(item) for item in d] - else: - return [str(d)] - - def format_entries(entries): - formatted_rows = [] - for i, entry in enumerate(entries): - dataset_idx = entry["metadata"]["dataset_record_idx"] - dataset_entry = self.dataset[dataset_idx] - input_lines = format_dict(dataset_entry["input"]) - expected_output_lines = format_dict(dataset_entry.get("expected_output", {})) - output_lines = format_dict(entry["output"]) - evaluations_lines = format_dict(entry.get("evaluations", [])) - - # Determine the maximum number of lines across all fields - max_lines = max( - len(input_lines), - len(expected_output_lines), - len(output_lines), - len(evaluations_lines), - ) - - # Pad the lists to have the same number of lines - input_lines += [""] * (max_lines - len(input_lines)) - expected_output_lines += [""] * (max_lines - len(expected_output_lines)) - output_lines += [""] * (max_lines - len(output_lines)) - evaluations_lines += [""] * (max_lines - len(evaluations_lines)) - - for j in range(max_lines): - if j == 0: - index = f"| {dataset_idx:<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" - else: 
- index = f"|{'':<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" - formatted_rows.append(index) - formatted_rows.append(separator) - return "\n".join(formatted_rows) - - if len(self.experiment_rows) <= 4: - entries = format_entries(self.experiment_rows) - else: - first_two = format_entries(self.experiment_rows[:2]) - last_two = format_entries(self.experiment_rows[-2:]) - entries = f"{first_two}\n| {'...':<8} | {'...':<38} | {'...':<38} | {'...':<38} | {'...':<38} |\n{separator}\n{last_two}" - - table = ( - f"{separator}\n" - f"| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} | {'Output':<38} | {'Evaluations':<38} |\n" - f"{separator}\n" - f"{entries}" - ) - return f"Experiment Results:\n{table if entries else 'No results available.'}\n\n" - def __iter__(self) -> Iterator[Dict[str, Any]]: return iter(self.experiment_rows) @@ -608,234 +434,155 @@ def push(self) -> Dict[str, str]: - experiment_name: The name of the experiment - span_count: Number of spans uploaded """ - _validate_api_keys() - - # Initialize connection and headers - conn = HTTPSConnection(BASE_URL) - headers = { - "DD-API-KEY": os.getenv("DD_API_KEY"), - "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), - "Content-Type": "application/json", - } - - try: - # Check if project exists - url = f"/api/unstable/llm-obs/v1/projects?filter[name]={self.experiment.project_name}" - response_data = _make_request(conn, headers, "GET", url, context="Project lookup") - projects = response_data.get("data", []) - - if not projects: - # Create new project - print(f"Project '{self.experiment.project_name}' not found. Creating it.") - project_payload = { - "data": { - "type": "projects", - "attributes": { - "name": self.experiment.project_name, - "description": "", - "metadata": {"team": "ml-obs"}, - }, - } - } - response_data = _make_request( - conn, - headers, - "POST", - "/api/unstable/llm-obs/v1/projects", - body=json.dumps(project_payload), - context="Project creation", - ) - project_id = response_data["data"]["id"] - else: - project_id = projects[0]["id"] - - # Check if experiment exists - encoded_name = quote(self.experiment.name) - url = f"/api/unstable/llm-obs/v1/experiments?filter[name]={encoded_name}" - response_data = _make_request(conn, headers, "GET", url, context="Experiment lookup") - experiments = response_data.get("data", []) - - if not experiments: - # Create new experiment - experiment_payload = { - "data": { - "type": "experiments", - "attributes": { - "name": self.experiment.name, - "description": self.experiment.description, - "dataset_id": self.experiment.dataset.datadog_dataset_id, - "project_id": project_id, - "metadata": { - "tags": self.experiment.tags, - **self.experiment.metadata, - }, - }, - } - } - response_data = _make_request( - conn, - headers, - "POST", - "/api/unstable/llm-obs/v1/experiments", - body=json.dumps(experiment_payload), - context="Experiment creation", - ) - experiment_id = response_data["data"]["id"] - else: - # Experiment exists, create a new version - version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - new_experiment_name = f"{self.experiment.name}-{version_suffix}" - print(f"Experiment '{self.experiment.name}' found. 
Creating new version '{new_experiment_name}'.") - experiment_payload = { - "data": { - "type": "experiments", - "attributes": { - "name": new_experiment_name, - "description": self.experiment.description, - "dataset_id": self.experiment.dataset.datadog_dataset_id, - "project_id": project_id, - "metadata": { - "tags": self.experiment.tags, - **self.experiment.metadata, - }, - }, - } + # Check if project exists + url = f"/api/unstable/llm-obs/v1/projects?filter[name]={self.experiment.project_name}" + resp = exp_http_request("GET", url) + response_data = resp.json() + projects = response_data.get("data", []) + if not projects: + # Create new project + project_payload = { + "data": { + "type": "projects", + "attributes": { + "name": self.experiment.project_name, + "description": "", + "metadata": {"team": "ml-obs"}, + }, } - response_data = _make_request( - conn, - headers, - "POST", - "/api/unstable/llm-obs/v1/experiments", - body=json.dumps(experiment_payload), - context="Experiment version creation", - ) - experiment_id = response_data["data"]["id"] - self.experiment.name = new_experiment_name - - spans = [] - metrics = [] - - for idx, result in enumerate(self.experiment_rows): - span = { - "span_id": _make_id(), - "project_id": project_id, - "experiment_id": experiment_id, - "dataset_id": self.experiment.dataset.datadog_dataset_id, - "dataset_record_id": _make_id(), - "start_ns": int(result["metadata"]["timestamp"] * 1e9), - "duration": float(result["metadata"]["duration"] * 1e9), - "tags": self.experiment.tags, - "status": "ok", - "metrics": {}, # TODO: Fill in with actual metrics once we have tracing and llm spans - "meta": { - "span": {"kind": "experiment"}, - "input": self.experiment.dataset[idx]["input"], - "output": result["output"], - "expected_output": self.experiment.dataset[idx].get("expected_output", {}), - "error": { - "message": result["error"], - "stack": None, - "type": None, + } + resp = exp_http_request( + "POST", + "/api/unstable/llm-obs/v1/projects", + body=json.dumps(project_payload).encode("utf-8"), + ) + response_data = resp.json() + project_id = response_data["data"]["id"] + else: + project_id = projects[0]["id"] + + # Check if experiment exists + encoded_name = quote(self.experiment.name) + url = f"/api/unstable/llm-obs/v1/experiments?filter[name]={encoded_name}" + resp = exp_http_request("GET", url) + response_data = resp.json() + experiments = response_data.get("data", []) + + if not experiments: + # Create new experiment + experiment_payload = { + "data": { + "type": "experiments", + "attributes": { + "name": self.experiment.name, + "description": self.experiment.description, + "dataset_id": self.experiment.dataset._datadog_dataset_id, + "project_id": project_id, + "metadata": { + "tags": self.experiment.tags, + **self.experiment.metadata, }, }, } - spans.append(span) - - # Add evaluation metrics - for metric_name, metric_value in result["evaluations"].items(): - timestamp_ms = int(result["metadata"]["timestamp"] * 1000) - - # Check for bool first, since bool is a subclass of int - if isinstance(metric_value, bool): - metric_type = "categorical" - metric_value = str(metric_value).lower() - elif isinstance(metric_value, (int, float)): - metric_type = "score" - else: - metric_type = "categorical" - metric_value = str(metric_value) - - metric = { - "span_id": span["span_id"], - "metric_type": metric_type, - "timestamp_ms": timestamp_ms, - "label": metric_name, - "score_value" if metric_type == "score" else "categorical_value": metric_value, - } - - if metric_type == 
"score": - metric["score_value"] = metric_value - else: - metric["categorical_value"] = metric_value - - metrics.append(metric) - - print(metrics) - results_payload = { + } + resp = exp_http_request( + "POST", "/api/unstable/llm-obs/v1/experiments", body=json.dumps(experiment_payload).encode("utf-8") + ) + response_data = resp.json() + experiment_id = response_data["data"]["id"] + else: + # Experiment exists, create a new version + version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + new_experiment_name = f"{self.experiment.name}-{version_suffix}" + experiment_payload = { "data": { "type": "experiments", - "attributes": {"spans": spans, "metrics": metrics}, + "attributes": { + "name": new_experiment_name, + "description": self.experiment.description, + "dataset_id": self.experiment.dataset._datadog_dataset_id, + "project_id": project_id, + "metadata": { + "tags": self.experiment.tags, + **self.experiment.metadata, + }, + }, } } - - url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" - _make_request( - conn, - headers, - "POST", - url, - body=json.dumps(results_payload), - context="Publishing results", + resp = exp_http_request( + "POST", "/api/unstable/llm-obs/v1/experiments", body=json.dumps(experiment_payload).encode("utf-8") ) + response_data = resp.json() + experiment_id = response_data["data"]["id"] + self.experiment.name = new_experiment_name + + spans = [] + metrics = [] + for idx, result in enumerate(self.experiment_rows): + span = { + "span_id": _make_id(), + "project_id": project_id, + "experiment_id": experiment_id, + "dataset_id": self.experiment.dataset._datadog_dataset_id, + "dataset_record_id": _make_id(), + "start_ns": int(result["metadata"]["timestamp"] * 1e9), + "duration": float(result["metadata"]["duration"] * 1e9), + "tags": self.experiment.tags, + "status": "ok", + "metrics": {}, # TODO: Fill in with actual metrics once we have tracing and llm spans + "meta": { + "span": {"kind": "experiment"}, + "input": self.experiment.dataset[idx]["input"], + "output": result["output"], + "expected_output": self.experiment.dataset[idx].get("expected_output", {}), + "error": { + "message": result["error"], + "stack": None, + "type": None, + }, + }, + } + spans.append(span) + + # Add evaluation metrics + for metric_name, metric_value in result["evaluations"].items(): + timestamp_ms = int(result["metadata"]["timestamp"] * 1000) + + # Check for bool first, since bool is a subclass of int + if isinstance(metric_value, bool): + metric_type = "categorical" + metric_value = str(metric_value).lower() + elif isinstance(metric_value, (int, float)): + metric_type = "score" + else: + metric_type = "categorical" + metric_value = str(metric_value) + + metric = { + "span_id": span["span_id"], + "metric_type": metric_type, + "timestamp_ms": timestamp_ms, + "label": metric_name, + "score_value" if metric_type == "score" else "categorical_value": metric_value, + } - print(f"✓ Successfully uploaded experiment results for '{self.experiment.name}'") - print(f" • Experiment ID: {experiment_id}") - print(f" • Spans uploaded: {len(spans)}") - print(f" • Metrics uploaded: {len(metrics)}") - - return self - - finally: - conn.close() - - -def _make_request( - conn: HTTPSConnection, - headers: Dict[str, Any], - method: str, - url: str, - body: Optional[Any] = None, - context: str = "", -) -> Dict[str, Any]: - """Make an HTTP request to the Datadog API. 
- - Raises: - DatadogAPIError: If the request fails or returns an error status - DatadogResponseError: If the response contains invalid JSON - """ - if method == "GET": - conn.request(method, url, headers=headers) - else: - if body is not None and isinstance(body, str): - body = body.encode("utf-8") - conn.request(method, url, body=body, headers=headers) - - response = conn.getresponse() - response_body = response.read() - response_text = response_body.decode("utf-8") + if metric_type == "score": + metric["score_value"] = metric_value + else: + metric["categorical_value"] = metric_value - if response.status >= 400: - error_message = f"HTTP {response.status} Error during {context}: {response.reason}\nResponse: {response_text}" - raise DatadogAPIError(error_message, status_code=response.status, response=response_text) + metrics.append(metric) - if not response_body: - return {} + results_payload = { + "data": { + "type": "experiments", + "attributes": {"spans": spans, "metrics": metrics}, + } + } - try: - return json.loads(response_body) - except json.JSONDecodeError: - error_message = f"Invalid JSON response during {context}. Status: {response.status}" - raise DatadogResponseError(error_message, raw_response=response_text) + url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" + exp_http_request("POST", url, body=json.dumps(results_payload).encode("utf-8")) + return self def _make_id() -> str: @@ -847,29 +594,8 @@ def _make_id() -> str: return uuid.uuid4().hex -class DatadogAPIError(Exception): - """Raised when there is an error interacting with the Datadog API.""" - - def __init__(self, message: str, status_code: Optional[int] = None, response: Optional[str] = None): - self.status_code = status_code - self.response = response - super().__init__(message) - - -class DatadogResponseError(Exception): - """Raised when there is an error parsing the response from Datadog.""" - - def __init__(self, message: str, raw_response: Optional[str] = None): - self.raw_response = raw_response - super().__init__(message) - - -def _validate_api_keys() -> None: - """Validate that required Datadog API keys are set in environment variables. - - Raises: - ValueError: If any required API keys are missing from environment variables - """ +def exp_http_request(method: str, url: str, body: Optional[bytes] = None) -> HTTPResponse: + """Make an HTTP request to the Datadog experiments API.""" missing_keys = [] for key in ["DD_API_KEY", "DD_APPLICATION_KEY"]: if not os.getenv(key): @@ -881,49 +607,10 @@ def _validate_api_keys() -> None: "Please set these environment variables before pushing to Datadog." ) - -class Prompt: - """A class for rendering templated prompts with variables. - - Supports both simple string templates and structured chat-like templates. - - Attributes: - template (Union[str, List[Dict[str, str]]]): Either a template string or a list of message dictionaries - variables (dict): Default variables to use when rendering the template - """ - - def __init__(self, template, variables=None): - """Initialize a new Prompt. - - Args: - template (Union[str, List[Dict[str, str]]]): Either a template string or a list of message dictionaries - variables (dict, optional): Default variables to use when rendering the template. Defaults to {}. - """ - self.template = template - self.variables = variables or {} - - def render(self, **kwargs): - """Render the template with provided variables. - - Args: - **kwargs: Additional variables to use when rendering the template. 
- These override any default variables with the same name. - - Returns: - Union[str, List[Dict[str, str]]]: The rendered template with all variables substituted - """ - merged_vars = {**self.variables, **kwargs} - - if isinstance(self.template, str): - return self.template.format(**merged_vars) - elif isinstance(self.template, (list, tuple)): - return [ - {k: v.format(**merged_vars) if isinstance(v, str) else v for k, v in message.items()} - for message in self.template - ] - else: - raise ValueError("Template must be either a string or a list of message dictionaries") - - def __repr__(self): - hash = hashlib.md5(str(self.template).encode()).hexdigest()[:8] - return f"Prompt(hash={hash})" + headers = { + "DD-API-KEY": os.getenv("DD_API_KEY"), + "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), + "Content-Type": "application/json", + } + url = BASE_URL + url + return http_request(method, url, headers=headers, body=body) diff --git a/ddtrace/llmobs/_utils.py b/ddtrace/llmobs/_utils.py index 7dd17ea94f3..667dcdd7fd5 100644 --- a/ddtrace/llmobs/_utils.py +++ b/ddtrace/llmobs/_utils.py @@ -1,7 +1,9 @@ +import http.client import json from typing import Dict from typing import Optional from typing import Union +import urllib.request import ddtrace from ddtrace import Span @@ -163,3 +165,32 @@ def safe_json(obj): return json.dumps(obj, skipkeys=True, default=_unserializable_default_repr) except Exception: log.error("Failed to serialize object to JSON.", exc_info=True) + + +class HTTPResponse: + def __init__(self, resp: http.client.HTTPResponse) -> None: + self._resp = resp + + @property + def status_code(self) -> int: + return self._resp.status + + def json(self) -> dict: + """Return the JSON content of the response. + + Note that this method can only be called once as the response content is read and consumed. 
+ """ + data = self._resp.read() + print(data) + return json.loads(data.decode("utf-8")) + + +def http_request( + method: str, url: str, headers: Optional[Dict[str, str]] = None, body: Optional[bytes] = None +) -> HTTPResponse: + # Create the request object + req = urllib.request.Request(url, data=body, method=method) + if headers: + for key, value in headers.items(): + req.add_header(key, value) + return HTTPResponse(urllib.request.urlopen(req)) diff --git a/tests/appsec/iast/fixtures/propagation_path.py b/tests/appsec/iast/fixtures/propagation_path.py index 7dcaa737995..d645e781e3f 100644 --- a/tests/appsec/iast/fixtures/propagation_path.py +++ b/tests/appsec/iast/fixtures/propagation_path.py @@ -2,13 +2,12 @@ CAVEAT: the line number is important to some IAST tests, be careful to modify this file and update the tests if you make some changes """ +import _io import asyncio import os import re import sys -import _io - ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) diff --git a/tests/llmobs/cassettes/experiments/test_dataset_pull.yaml b/tests/llmobs/cassettes/experiments/test_dataset_pull.yaml new file mode 100644 index 00000000000..3c23f1d0eb0 --- /dev/null +++ b/tests/llmobs/cassettes/experiments/test_dataset_pull.yaml @@ -0,0 +1,136 @@ +interactions: +- request: + body: null + headers: + Connection: + - close + Content-Type: + - application/json + Host: + - api.datadoghq.com + User-Agent: + - Python-urllib/3.12 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets?filter[name]=meal-calorie-dataset-multilingual-3 + response: + body: + string: '{"data":[{"id":"f61953f8-43de-4a99-bcaf-0b145471dcb0","type":"datasets","attributes":{"author":{"id":"8855edca-05df-11ec-bea0-da7ad0900002"},"created_at":"2024-11-05T15:57:29.145601Z","description":"A + dataset of meals and their expected calories","metadata":{"team":"ml-obs"},"name":"meal-calorie-dataset-multilingual-3","updated_at":"2024-11-05T15:57:29.222045Z"}}]}' + headers: + Connection: + - close + Content-Length: + - '367' + Content-Type: + - application/vnd.api+json + Date: + - Tue, 05 Nov 2024 21:46:56 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +- request: + body: null + headers: + Connection: + - close + Content-Type: + - application/json + Host: + - api.datadoghq.com + User-Agent: + - Python-urllib/3.12 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/f61953f8-43de-4a99-bcaf-0b145471dcb0/records + response: + body: + string: "{\"data\":[{\"id\":\"92ada783-c6c7-42c3-97c7-a64a2341aadd\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":1100},\"input\":{\"user_input\":\"had + a big mac with medium fries and a coke for lunch\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"450af99c-c56e-4b81-a466-eb1ec69b7ad4\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":416},\"input\":{\"user_input\":\"breakfast: + 2 eggs, toast with butter, and black 
coffee\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"7cd8db38-3532-4fbc-954c-1407814b79e7\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":535},\"input\":{\"user_input\":\"grilled + chicken breast with rice and steamed broccoli\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"0db67626-ecb4-4706-b395-79e4f3e4f635\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":400},\"input\":{\"user_input\":\"turkey + sandwich on wheat with lettuce and mayo\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"10adbc55-f928-4a54-b6d7-0ef24a9c4436\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":325},\"input\":{\"user_input\":\"bowl + of cheerios with 2% milk and a banana\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"1c930cda-8327-4058-94eb-c52a6926472a\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":810},\"input\":{\"user_input\":\"6-inch + subway turkey sub with chips and cookie\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"f544f506-907c-4ecb-a3da-f61a98000cec\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":720},\"input\":{\"user_input\":\"chipotle + bowl with chicken, rice, beans, and guac\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"94dd5c6c-5272-4aeb-89f4-46726c5c9a70\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":640},\"input\":{\"user_input\":\"salmon + fillet with quinoa and asparagus\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"c7171013-ef82-4a73-811d-70ab294bebb3\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":520},\"input\":{\"user_input\":\"chicken + caesar salad with croutons\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"04cdfe94-3b3c-4047-9532-99db98e02dd8\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":495},\"input\":{\"user_input\":\"peanut + butter and jelly sandwich with 
apple\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"63cd9fe2-9ed3-4ac2-99a0-3953c54a0248\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":2800},\"input\":{\"user_input\":\"omg + just demolished a whole pizza by myself \U0001F355\U0001F60B pepperoni + extra + cheese\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"b66641b9-e5fe-402a-bfe0-161243d68ac3\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":535},\"input\":{\"user_input\":\"post-workout + protein shake w banana n pb \U0001F4AA\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"12266e11-d4a7-4349-b144-d7db9fee9bbf\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":486},\"input\":{\"user_input\":\"brunch + goals! \U0001F60D avocado toast + poached eggs + mimosa #sundayfunday\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"365d6a99-5a87-496a-9a4d-1a5c1e942c86\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":1000},\"input\":{\"user_input\":\"living + my best life with this massive burrito \U0001F32F\u2728\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"f3f1f98a-1e87-4f09-9193-ec48580e21cd\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":420},\"input\":{\"user_input\":\"friday + night ice cream run! \U0001F366 got a waffle cone w/ 2 scoops\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"4067adbd-c3ca-4220-bee8-7c767f8373a4\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":327},\"input\":{\"user_input\":\"meal + prep sunday done right! 
\U0001F957 chicken + sweet potato + kale\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"5f332a5f-1805-4ca3-b4dc-edd664b9dcd0\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":1400},\"input\":{\"user_input\":\"cant + believe i ate this whole bag of doritos \U0001F631 #noregrets\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"59d278f5-2557-4b45-8900-e15b9cb50654\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":450},\"input\":{\"user_input\":\"smoothie + bowl szn \U0001F353 topped w granola and coconut #healthy\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"b75f753e-7629-4156-b5d7-52567d690c3d\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":925},\"input\":{\"user_input\":\"date + night vibes \U0001F35D pasta carbonara + garlic bread + wine\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"8f39141a-c9a0-4089-8ddf-9913f652d636\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":400},\"input\":{\"user_input\":\"birthday + cake for breakfast because yolo \U0001F382 #treatyoself\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"03a99f81-05f2-4708-9ee2-ce3432fdf4d9\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":550},\"input\":{\"user_input\":\"sushiz + 4 lunch - spicy tuna n californa role\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"14135673-a65e-4680-ad67-a90402cc25e4\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":416},\"input\":{\"user_input\":\"brekkie + - eggs n tosst w/ coffey\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"4feaaad7-560c-4fa6-b432-b02644b2d823\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":800},\"input\":{\"user_input\":\"choclate + milkshak n french friez\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"a57c9b1f-b633-4044-a9ca-d7b39015738a\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":400},\"input\":{\"user_input\":\"grilld + cheez n tomato 
soop\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"48abb37c-cd6c-411b-8727-1e6255ea0c69\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":1200},\"input\":{\"user_input\":\"chikn + alfredo pasta w/ garlic bred\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"f7f02f81-8fbe-4b3d-8520-c101f88a3837\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":325},\"input\":{\"user_input\":\"cerel + w/ banan n milk 4 brekfast\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"715d71e5-c0ca-466d-825b-9887e64d7dc9\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":800},\"input\":{\"user_input\":\"intermittent + fasting break: huge bowl of pasta #carbload\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"16169dc7-42cd-42ef-a9fe-1a09c4cda987\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":300},\"input\":{\"user_input\":\"clean + eating day 1: grilled fish and steamed vegetables\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"a033fdb7-fea6-47cb-9078-884efa832210\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":950},\"input\":{\"user_input\":\"foodie + adventures: trying this amazing wagyu burger \U0001F60D\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"9129656a-fb11-4980-9d27-ec51e6dbc485\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":480},\"input\":{\"user_input\":\"midnight + munchies: instant ramen with egg \U0001F35C\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"67f163c5-8fb7-43ec-aeee-d3ef0807e6ef\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":600},\"input\":{\"user_input\":\"almuerzo: + 2 tacos de pollo con guacamole y arroz\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"96a4841f-f929-4e00-9a6a-d88b66effa7e\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"had + some ph\u1EDF with extra brisket for 
dinner\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"90717f0d-df1c-4dc0-a547-a4288790a435\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":950},\"input\":{\"user_input\":\"butter + chicken with naan and rice \U0001F1EE\U0001F1F3\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"3d39ee90-38c1-4b96-ac43-01f0e488f87b\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"authentic + pad thai with shrimp #thaifood\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"fb53483f-987e-4798-a02f-d4dde2105a34\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":600},\"input\":{\"user_input\":\"homemade + sushi rolls - california y spicy tuna\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"fcd71b82-e4cb-4120-858a-bfb435a950d6\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":850},\"input\":{\"user_input\":\"cena: + pasta alla carbonara con pancetta\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"c299c62e-abb4-4d4d-8ad8-3e90d0c7cecd\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":750},\"input\":{\"user_input\":\"dim + sum brunch: siu mai, har gow, and char siu bao\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"d7ac551e-09ad-4a4b-b444-881c9eabd925\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":850},\"input\":{\"user_input\":\"kebab + plate with hummus and tabouleh\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"d15fc64e-a3cb-46c1-905b-25660db665bf\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"bibimbap + with extra gochujang \U0001F1F0\U0001F1F7\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"a77f58e0-6a33-462d-8cd3-a5f4c45c5749\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":900},\"input\":{\"user_input\":\"enchiladas + verdes con pollo y 
frijoles\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"0bfdd8fd-7c32-4c45-a860-70c28d243b60\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"\u4ECA\u5929\u4E2D\u5348\u5403\u4E86\u53C9\u70E7\u996D\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"b98c669c-246e-47e8-a671-5aab9682bbb6\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":850},\"input\":{\"user_input\":\"almo\xE7o + hoje foi uma lasanha bem gorda\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"aae1f1a0-ab23-49fa-91b0-2792be8917cf\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"\u0623\u0643\u0644\u062A + \u0627\u0644\u063A\u062F\u0627\u0621 \u0627\u0644\u064A\u0648\u0645 \u0633\u0645\u0643\u0629 + \u0645\u0634\u0648\u064A\u0629 \u0628\u0627\u0644\u0644\u062D\u0645\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"999b9c93-91e5-4d7f-b31f-83d3f886b850\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":500},\"input\":{\"user_input\":\"\u0431\u043E\u0440\u0449 + \u0441 \u0447\u0435\u0441\u043D\u043E\u0447\u043D\u044B\u043C\u0438 \u043F\u0430\u043C\u043F\u0443\u0448\u043A\u0430\u043C\u0438 + \u043D\u0430 \u043E\u0431\u0435\u0434 \"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"5c15fe8f-cbb6-462e-91ed-f19a7c6fc51b\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":600},\"input\":{\"user_input\":\"\u9EBB\u5A46\u8C46\u8150\u914D\u7C73\u996D\uFF0C\u5F88\u8FA3\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"99a89c03-10ea-4627-be2a-c3c7bf4a0cb8\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":900},\"input\":{\"user_input\":\"feijoada + completa com farofa e couve\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"fdaec01d-ac8a-491f-b756-67cca9373a5f\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":850},\"input\":{\"user_input\":\"schnitzel + mit kartoffelsalat und bier 
\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"e4c36b98-1853-42d6-92ec-c919a75d0b9c\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"stamppot + boerenkool met rookworst\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"9bf1dfef-bb33-453c-9938-cbed356f5eb9\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"nasi + goreng dengan telur dan satay ayam\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"3ef03ee2-6aee-4efc-b02e-84cc8a1bea29\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":600},\"input\":{\"user_input\":\"\uBD88\uACE0\uAE30 + with \uAE40\uCE58 and extra \uBC25\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"2db67005-03d5-44ae-b819-9d838249d45a\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"\u30E9\u30FC\u30E1\u30F3\u7279\u76DB\u308A\u3001\u30C1\u30E3\u30FC\u30B7\u30E5\u30FC\u8FFD\u52A0\u3067\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"2e66c288-a920-47a4-91ba-58dba787b608\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"\u0926\u093E\u0932 + \u092E\u0916\u0928\u0940 \u0914\u0930 \u092C\u091F\u0930 \u0928\u093E\u0928 + \u0916\u093E\u092F\u093E\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"a5b8d3be-a56e-4f0d-bff3-e515410beff6\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"coq + au vin avec pur\xE9e de pommes de terre\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"d5d8c6b6-4043-41e1-868e-c6b653f40451\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":800},\"input\":{\"user_input\":\"\u0643\u0633\u0643\u0633 + \u0628\u0644\u062D\u0645 \u0627\u0644\u0636\u0623\u0646 \u0648\u0627\u0644\u062E\u0636\u0631\u0648\u0627\u062A\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"f6c22335-7eed-40ce-b25f-99a9b8e3bc97\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":650},\"input\":{\"user_input\":\"\u03B3\u03B5\u03BC\u03B9\u03C3\u03C4\u03AC + 
\u03BC\u03B5 \u03C4\u03B6\u03B1\u03C4\u03B6\u03AF\u03BA\u03B9 \u03BA\u03B1\u03B9 + \u03C6\u03AD\u03C4\u03B1\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"758a5d1f-ac29-429c-bd0d-cbe086e3b43a\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":600},\"input\":{\"user_input\":\"pierogi + ruskie ze \u015Bmietan\u0105\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"4a5d0a55-c7be-46e0-8fd5-64dd84972e2c\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":500},\"input\":{\"user_input\":\"b\xE1nh + x\xE8o v\u1EDBi n\u01B0\u1EDBc m\u1EAFm pha\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"cedbb75a-b3ae-4169-ad55-70a24267b97e\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":600},\"input\":{\"user_input\":\"\u05DE\u05E6\u05D0 + \u05E4\u05DC\u05D0\u05E4\u05DC \u05E2\u05DD \u05D7\u05D5\u05DE\u05D5\u05E1 + \u05D5\u05E1\u05DC\u05D8\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}}]}" + headers: + Connection: + - close + Content-Type: + - application/vnd.api+json + Date: + - Tue, 05 Nov 2024 21:46:57 GMT + Transfer-Encoding: + - chunked + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/cassettes/experiments/test_dataset_pull_dne.yaml b/tests/llmobs/cassettes/experiments/test_dataset_pull_dne.yaml new file mode 100644 index 00000000000..7cc0e636a2c --- /dev/null +++ b/tests/llmobs/cassettes/experiments/test_dataset_pull_dne.yaml @@ -0,0 +1,38 @@ +interactions: +- request: + body: null + headers: + Connection: + - close + Content-Type: + - application/json + Host: + - api.datadoghq.com + User-Agent: + - Python-urllib/3.12 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets?filter[name]=dataset-does-not-exist + response: + body: + string: '{"data":[]}' + headers: + Connection: + - close + Content-Length: + - '11' + Content-Type: + - application/vnd.api+json + Date: + - Tue, 05 Nov 2024 21:57:02 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/test_llmobs_experiments.py b/tests/llmobs/test_llmobs_experiments.py index f01551afc17..482890fdf4c 100644 --- a/tests/llmobs/test_llmobs_experiments.py +++ b/tests/llmobs/test_llmobs_experiments.py @@ -1,4 +1,5 @@ import itertools +import os from typing import Any from typing import Callable from typing import Dict @@ -6,6 +7,28 @@ from typing import Union from ddtrace.llmobs import Dataset +import pytest +import vcr + + +# Define a function to scrub the headers you want to remove +def scrub_response_headers(response): + # Remove specific headers + headers_to_remove = ["content-security-policy"] + for header in headers_to_remove: + 
response["headers"].pop(header, None) + return response + + +@pytest.fixture +def experiments_vcr(): + return vcr.VCR( + cassette_library_dir=os.path.join(os.path.dirname(__file__), "cassettes/experiments"), + record_mode="once", + match_on=["path"], + filter_headers=["DD-API-KEY", "DD-APPLICATION-KEY", "Openai-Api-Key", "Authorization"], + before_record_response=scrub_response_headers, + ) def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: @@ -43,9 +66,22 @@ def test_create_dataset(): {"input": {"prompt": "capital of Germany?"}, "expected_output": {"response": "Berlin"}}, {"input": {"prompt": "capital of Japan?"}, "expected_output": {"response": "Tokyo"}}, {"input": {"prompt": "capital of Canada?"}, "expected_output": {"response": "Ottawa"}}, - # ... more data entries ... ], ) - assert dataset.name == "geography-dataset" assert dataset[0] == {"input": {"prompt": "capital of France?"}, "expected_output": {"response": "Paris"}} + + +def test_dataset_pull(experiments_vcr): + with experiments_vcr.use_cassette("test_dataset_pull.yaml"): + dataset = Dataset.from_datadog("meal-calorie-dataset-multilingual-3") + assert len(dataset) > 0 + assert isinstance(dataset[0], dict) + assert "input" in dataset[0] + assert "expected_output" in dataset[0] + + +def test_dataset_pull_dne(experiments_vcr): + with experiments_vcr.use_cassette("test_dataset_pull_dne.yaml"): + with pytest.raises(ValueError): + Dataset.from_datadog("dataset-does-not-exist") diff --git a/tests/llmobs/test_utils.py b/tests/llmobs/test_utils.py index 09f23926e86..2b7011c1fc4 100644 --- a/tests/llmobs/test_utils.py +++ b/tests/llmobs/test_utils.py @@ -1,5 +1,6 @@ import pytest +from ddtrace.llmobs._utils import http_request from ddtrace.llmobs.utils import Documents from ddtrace.llmobs.utils import Messages @@ -99,3 +100,12 @@ def test_documents_dictionary_with_incorrect_value_types(): Documents({"text": "hello", "name": {"key": "value"}}) with pytest.raises(TypeError): Documents([{"text": "hello", "score": "123"}]) + + +def test_http_request(): + response = http_request("GET", "https://httpbin.org/get") + assert response.status_code == 200 + data = response.json() + assert data["url"] == "https://httpbin.org/get" + assert data["args"] == {} + assert data["headers"]["Host"] == "httpbin.org" From d29f08185397ba1876f65a5593ccb059589146f3 Mon Sep 17 00:00:00 2001 From: Kyle Verhoog Date: Tue, 5 Nov 2024 17:09:51 -0500 Subject: [PATCH 15/36] fmt --- ddtrace/llmobs/_utils.py | 1 - tests/llmobs/test_llmobs_experiments.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ddtrace/llmobs/_utils.py b/ddtrace/llmobs/_utils.py index 667dcdd7fd5..898707998ec 100644 --- a/ddtrace/llmobs/_utils.py +++ b/ddtrace/llmobs/_utils.py @@ -181,7 +181,6 @@ def json(self) -> dict: Note that this method can only be called once as the response content is read and consumed. 
""" data = self._resp.read() - print(data) return json.loads(data.decode("utf-8")) diff --git a/tests/llmobs/test_llmobs_experiments.py b/tests/llmobs/test_llmobs_experiments.py index 482890fdf4c..7d3612e5bd8 100644 --- a/tests/llmobs/test_llmobs_experiments.py +++ b/tests/llmobs/test_llmobs_experiments.py @@ -6,10 +6,11 @@ from typing import List from typing import Union -from ddtrace.llmobs import Dataset import pytest import vcr +from ddtrace.llmobs import Dataset + # Define a function to scrub the headers you want to remove def scrub_response_headers(response): From dc119d0894183e30134f28ae4b3eace038de5ddf Mon Sep 17 00:00:00 2001 From: Kyle Verhoog Date: Tue, 5 Nov 2024 17:19:52 -0500 Subject: [PATCH 16/36] more stdout cleanup, http status code checking --- ddtrace/llmobs/_experiments.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index f1998f1a8cb..8ae14343999 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -378,17 +378,8 @@ def evaluate_output(idx_output): results_buffer[idx] = future.result()["result"] completed += 1 - # Update progress - progress = int(50 * completed / total_rows) - bar = f"{'=' * progress}{' ' * (50 - progress)}" - percent = int(100 * completed / total_rows) - sys.stdout.write(f"\rEvaluating {self.name}: [{bar}] {percent}% ({completed}/{total_rows})") - sys.stdout.flush() - results.experiment_rows = results_buffer - sys.stdout.write("\n") - self.has_evaluated = True self.results = results return results @@ -425,7 +416,7 @@ def __len__(self) -> int: def __getitem__(self, index: int) -> Any: return self.experiment_rows[index] - def push(self) -> Dict[str, str]: + def push(self) -> None: """Push the experiment results to Datadog. 
Returns: @@ -582,7 +573,6 @@ def push(self) -> Dict[str, str]: url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" exp_http_request("POST", url, body=json.dumps(results_payload).encode("utf-8")) - return self def _make_id() -> str: @@ -613,4 +603,6 @@ def exp_http_request(method: str, url: str, body: Optional[bytes] = None) -> HTT "Content-Type": "application/json", } url = BASE_URL + url - return http_request(method, url, headers=headers, body=body) + resp = HTTPResponse(http_request(method, url, headers=headers, body=body)) + if resp.status_code >= 400: + raise ValueError(f"Failed to make request, got status code {resp.status_code}.") From f0182984c2cba36a27fe11ac3e4f404f2ebb086b Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Fri, 8 Nov 2024 15:13:25 -0500 Subject: [PATCH 17/36] Add feedback from sync --- ddtrace/llmobs/__init__.py | 6 +- ddtrace/llmobs/_experiments.py | 736 +++++++++++++++++++++++++++------ ddtrace/llmobs/_utils.py | 41 +- 3 files changed, 653 insertions(+), 130 deletions(-) diff --git a/ddtrace/llmobs/__init__.py b/ddtrace/llmobs/__init__.py index 73429c6713d..bd382219754 100644 --- a/ddtrace/llmobs/__init__.py +++ b/ddtrace/llmobs/__init__.py @@ -9,7 +9,11 @@ from ._experiments import Dataset from ._experiments import Experiment from ._experiments import ExperimentResults +from ._experiments import FileType +from ._experiments import task +from ._experiments import evaluator +from ._experiments import ExperimentGrid from ._llmobs import LLMObs -__all__ = ["LLMObs", "Dataset", "Experiment", "ExperimentResults"] +__all__ = ["LLMObs", "Dataset", "Experiment", "ExperimentResults", "FileType", "task", "evaluator", "ExperimentGrid"] diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 8ae14343999..fe093aec074 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,23 +1,33 @@ +# TODO: Add error handling + import concurrent.futures from datetime import datetime import json import os -import sys import time -from typing import Any -from typing import Callable -from typing import Dict -from typing import Iterator -from typing import List -from typing import Optional +from typing import Any, Callable, Dict, Iterator, List, Optional +import inspect +from functools import wraps from urllib.parse import quote import uuid +import csv +from enum import Enum +import itertools +import hashlib +import threading from ._utils import HTTPResponse from ._utils import http_request -BASE_URL = "https://api.datadoghq.com" +DD_SITE = os.getenv("DD_SITE", "datadoghq.com") +BASE_URL = f"https://api.{DD_SITE}" + + +class FileType(Enum): + CSV = 'csv' + PARQUET = 'parquet' + JSONL = 'jsonl' class Dataset: @@ -74,13 +84,23 @@ def _validate_data(self, data: List[Dict[str, Any]]) -> None: if set(row.keys()) != first_row_keys: raise ValueError("All rows must have the same keys.") + # Validate that 'input' exists and is a dictionary + if 'input' not in row: + raise ValueError("Each row must contain an 'input' field") + if not isinstance(row['input'], dict): + raise ValueError("The 'input' field must be a dictionary") + + # If expected_output exists, validate it's a dictionary + if 'expected_output' in row and not isinstance(row['expected_output'], dict): + raise ValueError("The 'expected_output' field must be a dictionary") + # Check that 'input' and 'expected_output' are flat dictionaries for key in ["input", "expected_output"]: if key in row and any(isinstance(value, dict) for value in row[key].values()): raise 
ValueError(f"'{key}' must be a flat dictionary (no nested dictionaries).") @classmethod - def from_datadog(cls, name: str) -> "Dataset": + def pull(cls, name: str) -> "Dataset": """Create a dataset from a dataset hosted in Datadog. Args: @@ -127,7 +147,7 @@ def from_datadog(cls, name: str) -> "Dataset": dataset._datadog_dataset_id = dataset_id return dataset - def push(self) -> Dict[str, str]: + def push(self) -> None: """Push the dataset to Datadog. Returns: @@ -173,7 +193,235 @@ def push(self) -> Dict[str, str]: url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" resp = exp_http_request("POST", url, body=json.dumps(records_payload).encode("utf-8")) data = resp.json() - return data + + @classmethod + def from_csv( + cls, + filepath: str, + name: str, + description: str = "", + delimiter: str = ",", + input_columns: List[str] = None, + expected_output_columns: List[str] = None, + metadata_columns: List[str] = None, + ) -> "Dataset": + if input_columns is None or expected_output_columns is None: + raise ValueError("`input_columns` and `expected_output_columns` must be provided.") + + data = [] + try: + with open(filepath, mode='r', encoding='utf-8') as csvfile: + reader = csv.DictReader(csvfile, delimiter=delimiter) + rows = list(reader) + if not rows: + raise ValueError("CSV file is empty.") + + # Ensure that the specified columns are present + header_columns = reader.fieldnames + missing_input_columns = [col for col in input_columns if col not in header_columns] + missing_output_columns = [col for col in expected_output_columns if col not in header_columns] + missing_metadata_columns = [] + if metadata_columns: + missing_metadata_columns = [col for col in metadata_columns if col not in header_columns] + + if missing_input_columns: + raise ValueError(f"Input columns not found in CSV header: {missing_input_columns}") + if missing_output_columns: + raise ValueError(f"Expected output columns not found in CSV header: {missing_output_columns}") + if missing_metadata_columns: + raise ValueError(f"Metadata columns not found in CSV header: {missing_metadata_columns}") + + for row in rows: + input_data = {col: row[col] for col in input_columns} + expected_output_data = {col: row[col] for col in expected_output_columns} + metadata = {} + if metadata_columns: + metadata = {col: row[col] for col in metadata_columns} + + data.append({ + 'input': input_data, + 'expected_output': expected_output_data, + **metadata, + }) + except Exception as e: + raise Exception(f"Failed to read CSV file: {e}") + + return cls(name=name, data=data, description=description) + + @classmethod + def from_jsonl(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": + if input_columns is None or expected_output_columns is None: + raise ValueError("`input_columns` and `expected_output_columns` must be provided.") + + data = [] + try: + with open(filepath, mode='r', encoding='utf-8') as jsonlfile: + for line in jsonlfile: + row = json.loads(line.strip()) + + input_data = {col: row.get(col) for col in input_columns} + expected_output_data = {col: row.get(col) for col in expected_output_columns} + metadata = {} + if metadata_columns: + metadata = {col: row.get(col) for col in metadata_columns} + + data.append({ + 'input': input_data, + 'expected_output': expected_output_data, + **metadata, + }) + + if not data: + raise ValueError("JSONL file is empty.") + + except Exception as e: + raise 
Exception(f"Failed to read JSONL file: {e}") + + return cls(name=name, data=data, description=description) + + @classmethod + def from_parquet(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": + try: + import pandas as pd + except ImportError: + raise ImportError( + "pandas is required to read parquet files. " + "Please install pandas with: pip install pandas" + ) + + if input_columns is None or expected_output_columns is None: + raise ValueError("`input_columns` and `expected_output_columns` must be provided.") + + data = [] + try: + df = pd.read_parquet(filepath) + if df.empty: + raise ValueError("Parquet file is empty.") + + # Ensure that the specified columns are present + missing_input_columns = [col for col in input_columns if col not in df.columns] + missing_output_columns = [col for col in expected_output_columns if col not in df.columns] + missing_metadata_columns = [] + if metadata_columns: + missing_metadata_columns = [col for col in metadata_columns if col not in df.columns] + + if missing_input_columns: + raise ValueError(f"Input columns not found in DataFrame: {missing_input_columns}") + if missing_output_columns: + raise ValueError(f"Expected output columns not found in DataFrame: {missing_output_columns}") + if missing_metadata_columns: + raise ValueError(f"Metadata columns not found in DataFrame: {missing_metadata_columns}") + + for idx, row in df.iterrows(): + input_data = {col: row[col] for col in input_columns} + expected_output_data = {col: row[col] for col in expected_output_columns} + metadata = {} + if metadata_columns: + metadata = {col: row[col] for col in metadata_columns} + + data.append({ + 'input': input_data, + 'expected_output': expected_output_data, + **metadata, + }) + + except Exception as e: + raise Exception(f"Failed to read Parquet file: {e}") + + return cls(name=name, data=data, description=description) + + @classmethod + def import_file(cls, path: str, filetype: FileType, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None, delimiter: str = ",") -> "Dataset": + if filetype == FileType.CSV: + return cls.from_csv( + filepath=path, + name=name, + description=description, + delimiter=delimiter, + input_columns=input_columns, + expected_output_columns=expected_output_columns, + metadata_columns=metadata_columns, + ) + elif filetype == FileType.JSONL: + return cls.from_jsonl( + filepath=path, + name=name, + description=description, + input_columns=input_columns, + expected_output_columns=expected_output_columns, + metadata_columns=metadata_columns, + ) + elif filetype == FileType.PARQUET: + return cls.from_parquet( + filepath=path, + name=name, + description=description, + input_columns=input_columns, + expected_output_columns=expected_output_columns, + metadata_columns=metadata_columns, + ) + else: + raise ValueError(f"Unsupported file type: {filetype}") + + def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": + """Convert the dataset to a pandas DataFrame. + + Args: + multiindex (bool): If True, expand 'input' and 'expected_output' dictionaries into columns with MultiIndex. + If False, keep 'input' and 'expected_output' as columns containing dictionaries. + + Returns: + pd.DataFrame: DataFrame representation of the dataset. + + Raises: + ImportError: If pandas is not installed. 
+ """ + try: + import pandas as pd + except ImportError: + raise ImportError( + "pandas is required to convert dataset to DataFrame. " + "Please install it with `pip install pandas`" + ) + + if multiindex: + # Create a list of flattened dictionaries + flattened_data = [] + for record in self._data: + flat_record = {} + # Handle 'input' fields + for k, v in record.get('input', {}).items(): + flat_record[('input', k)] = v + # Handle 'expected_output' fields + for k, v in record.get('expected_output', {}).items(): + flat_record[('expected_output', k)] = v + # Handle any other top-level fields + for k, v in record.items(): + if k not in ['input', 'expected_output']: + flat_record[('metadata', k)] = v + flattened_data.append(flat_record) + + df = pd.DataFrame(flattened_data) + # Set columns as MultiIndex + df.columns = pd.MultiIndex.from_tuples(df.columns) + return df + else: + # Keep 'input' and 'expected_output' as dicts in the DataFrame + return pd.DataFrame(self._data) + + def export_to_jsonl(self, file_path): + """ + Exports the dataset to a JSONL file. + + Args: + file_path (str): The path to the output JSONL file. + """ + import json + + with open(file_path, 'w') as f: + for record in self._data: + json_line = json.dumps(record) + f.write(json_line + '\n') class Experiment: @@ -205,6 +453,7 @@ def __init__( project_name: str = "-", description: str = "", metadata: Dict[str, Any] = {}, + config: Optional[Dict[str, Any]] = None, ) -> None: self.name = name self.task = task @@ -214,59 +463,51 @@ def __init__( self.project_name = project_name self.description = description self.metadata = metadata + self.config = config + + # Enforce that the task function has the @task decorator + if not hasattr(self.task, '_is_task'): + raise TypeError("Task function must be decorated with @task decorator.") + + # Enforce that all evaluators have the @evaluator decorator + for evaluator_func in self.evaluators: + if not hasattr(evaluator_func, '_is_evaluator'): + raise TypeError(f"Evaluator '{evaluator_func.__name__}' must be decorated with @evaluator decorator.") + # Post-run attributes self.has_run = False self.has_evaluated = False self.outputs = [] self.results = None - def _validate_tasks(self) -> None: - # TODO: Design and implement this - pass - - def _validate_evaluators(self) -> None: - # TODO: Design and implement this - pass - - def _validate_tags(self) -> None: - """Validate experiment tags format. - - Raises: - ValueError: If any tag doesn't follow the 'key:value' format - """ - for tag in self.tags: - if not isinstance(tag, str) or ":" not in tag: - raise ValueError(f"Invalid tag format: {tag}. Tags should be in the format 'key:value'.") - - def run(self, _jobs: int = 10) -> None: - """Execute the experiment tasks on the dataset without performing evaluations. - - Runs the task function on each dataset record in parallel and stores - the outputs and metadata. - - Args: - _jobs (int, optional): Number of parallel workers. Defaults to 10. - Must be between 1 and 20. 
- - Raises: - ValueError: If _jobs is not between 1 and 20 - """ + def run_task(self, _jobs: int = 10) -> None: + """Execute the task function on the dataset and store the outputs.""" if not 1 <= _jobs <= 20: raise ValueError("Number of jobs must be between 1 and 20") - self.outputs = [] total_rows = len(self.dataset) + completed = 0 def process_row(idx_row): idx, row = idx_row try: - # Apply the task function to the row + # Extract the input data + input_data = row['input'] + # Apply the task function to the input data with config start_time = time.time() - output = self.task(row) + if getattr(self.task, '_accepts_config', False): + output = self.task(input_data, self.config) + else: + output = self.task(input_data) end_time = time.time() duration = end_time - start_time - return { + # **Ensure output is a dictionary** + if not isinstance(output, dict): + output = {'value': output} + + # Prepare output data + output_data = { "idx": idx, "output": output, "metadata": { @@ -279,8 +520,10 @@ def process_row(idx_row): }, "error": None, } + return output_data + except Exception as e: - return { + output_data = { "idx": idx, "output": None, "metadata": { @@ -293,100 +536,87 @@ def process_row(idx_row): }, "error": str(e), } + return output_data + + # Initialize the progress bar + _print_progress_bar(0, total_rows, prefix='Processing:', suffix='Complete') with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: future_to_idx = {executor.submit(process_row, (idx, row)): idx for idx, row in enumerate(self.dataset)} - # Process as they complete while maintaining order - completed = 0 outputs_buffer = [None] * total_rows for future in concurrent.futures.as_completed(future_to_idx): idx = future_to_idx[future] - outputs_buffer[idx] = future.result() + output_data = future.result() + outputs_buffer[idx] = output_data completed += 1 - self.outputs = outputs_buffer + _print_progress_bar(completed, total_rows, prefix='Processing:', suffix='Complete') + self.outputs = outputs_buffer self.has_run = True - return self.eval() - - def eval(self, _jobs: int = 10) -> "ExperimentResults": - """Evaluate the outputs using the provided evaluators. - - Runs the evaluators on each output in parallel and collects evaluations. - - Args: - _jobs (int, optional): Number of parallel workers. Defaults to 10. - Must be between 1 and 20. - - Returns: - ExperimentResults: Object containing the experiment results - - Raises: - ValueError: If _jobs is not between 1 and 20 - ValueError: If the experiment has not been run yet - """ - if not 1 <= _jobs <= 20: - raise ValueError("Number of jobs must be between 1 and 20") - + def run_evaluations(self) -> None: + """Run evaluators on the outputs and store the results.""" if not self.has_run: - raise ValueError("Experiment has not been run yet. Please call run() before eval().") + raise ValueError("Task has not been run yet. 
Please call run_task() before run_evaluations().") - results = ExperimentResults(self.dataset, self) + self.results = ExperimentResults(self.dataset, self) + results_buffer = [] total_rows = len(self.outputs) + completed = 0 + + # Initialize the progress bar + _print_progress_bar(0, total_rows, prefix='Evaluating:', suffix='Complete') - def evaluate_output(idx_output): - idx, output_data = idx_output + for idx, output_data in enumerate(self.outputs): try: - idx_in_dataset = output_data["metadata"]["dataset_record_idx"] - row = self.dataset[idx_in_dataset] + # Retrieve output from output_data output = output_data["output"] - evaluations = {evaluator.__name__: evaluator(row, output) for evaluator in self.evaluators} - + # Get the corresponding dataset row + dataset_row = self.dataset[idx] + input_data = dataset_row.get('input', {}) + expected_output = dataset_row.get('expected_output', {}) + + # Perform evaluation + evaluations = {} + for evaluator in self.evaluators: + evaluation_result = evaluator(expected_output, output, input_data) + evaluations[evaluator.__name__] = evaluation_result + + # Prepare result data result = { "output": output, "evaluations": evaluations, "metadata": output_data["metadata"], "tags": self.tags, - "error": output_data["error"], + "error": None #TODO: Add error handling } - - return {"idx": idx, "result": result} except Exception as e: - return { - "idx": idx, - "result": { - "output": output_data["output"], - "evaluations": {}, - "metadata": output_data["metadata"], - "tags": self.tags, - "error": str(e), - }, + result = { + "output": output_data.get('output'), + "evaluations": {}, + "metadata": output_data["metadata"], + "tags": self.tags, + "error": str(e), } - with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: - future_to_idx = { - executor.submit(evaluate_output, (idx, output_data)): idx - for idx, output_data in enumerate(self.outputs) - } - - # Process as they complete while maintaining order - completed = 0 - results_buffer = [None] * total_rows - for future in concurrent.futures.as_completed(future_to_idx): - idx = future_to_idx[future] - results_buffer[idx] = future.result()["result"] - completed += 1 - - results.experiment_rows = results_buffer + results_buffer.append(result) + completed += 1 + _print_progress_bar(completed, total_rows, prefix='Evaluating:', suffix='Complete') self.has_evaluated = True - self.results = results - return results + self.results.experiment_rows = results_buffer + + def run(self, _jobs: int = 10) -> "ExperimentResults": + """Execute the task and evaluations, returning the results.""" + self.run_task(_jobs=_jobs) + self.run_evaluations() + print() # Move to the next line after completion + return self.results def get_results(self) -> "ExperimentResults": if not self.has_evaluated: - raise ValueError("Evaluations have not been performed yet. Please call eval() after run().") + raise ValueError("Evaluations have not been performed yet. Please call run() or run_evaluations().") return self.results @@ -416,15 +646,18 @@ def __len__(self) -> int: def __getitem__(self, index: int) -> Any: return self.experiment_rows[index] - def push(self) -> None: + def push(self, overwrite: bool = False) -> None: """Push the experiment results to Datadog. 
- Returns: - Dict[str, str]: Dictionary containing experiment information including: - - experiment_id: The ID of the created experiment - - experiment_name: The name of the experiment - - span_count: Number of spans uploaded + Raises: + ValueError: If the dataset hasn't been pushed to Datadog first """ + if not self.experiment.dataset._datadog_dataset_id: + raise ValueError( + "Dataset has not been pushed to Datadog. " + "Please call dataset.push() before pushing experiment results." + ) + # Check if project exists url = f"/api/unstable/llm-obs/v1/projects?filter[name]={self.experiment.project_name}" resp = exp_http_request("GET", url) @@ -472,6 +705,7 @@ def push(self) -> None: "metadata": { "tags": self.experiment.tags, **self.experiment.metadata, + "config": self.experiment.config, }, }, } @@ -496,6 +730,7 @@ def push(self) -> None: "metadata": { "tags": self.experiment.tags, **self.experiment.metadata, + "config": self.experiment.config, }, }, } @@ -574,6 +809,113 @@ def push(self) -> None: url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" exp_http_request("POST", url, body=json.dumps(results_payload).encode("utf-8")) + def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": + """Convert the experiment results to a pandas DataFrame, including the experiment config. + + Args: + multiindex (bool): If True, expand nested dictionaries into MultiIndex columns. + If False, keep the nested dictionaries as they are. + + Returns: + pd.DataFrame: A DataFrame representation of the experiment results. + + Raises: + ImportError: If pandas is not installed. + """ + try: + import pandas as pd + except ImportError: + raise ImportError( + "pandas is required to convert experiment results to DataFrame. " + "Please install it with `pip install pandas`" + ) + + # Collect data + data = [] + for result in self.experiment_rows: + record = {} + # Get index of the dataset record + idx = result['metadata'].get('dataset_record_idx') + dataset_record = self.dataset[idx] + + if multiindex: + + # Flatten 'input' and 'expected_output' from the dataset + for k, v in dataset_record.get('input', {}).items(): + record[('input', k)] = v + for k, v in dataset_record.get('expected_output', {}).items(): + record[('expected_output', k)] = v + + # Flatten 'output' from the result + output = result.get('output', {}) + if isinstance(output, dict): + for k, v in output.items(): + record[('output', k)] = v + else: + record[('output', 'value')] = output + + # Flatten 'evaluations' from the result + evaluations = result.get('evaluations', {}) + for evaluator_name, evaluation in evaluations.items(): + if isinstance(evaluation, dict): + for k, v in evaluation.items(): + record[('evaluations', evaluator_name, k)] = v + else: + record[('evaluations', evaluator_name)] = evaluation + + # Flatten 'config' from the experiment, if it exists + if self.experiment.config: + for k, v in self.experiment.config.items(): + record[('config', k)] = v + + # Flatten 'metadata' from the result + for k, v in result.get('metadata', {}).items(): + # Skip project_name, experiment_name, and dataset_name + if k not in ['project_name', 'experiment_name', 'dataset_name']: + record[('metadata', k)] = v + + + # Include 'error' if any + error = result.get('error') + if error: + record[('error', 'message')] = error + + else: + # Include config as a dictionary, if it exists + if self.experiment.config: + record['config'] = self.experiment.config + + # Keep nested dictionaries + record['input'] = dataset_record.get('input', {}) + 
record['expected_output'] = dataset_record.get('expected_output', {}) + record['output'] = result.get('output', {}) + record['evaluations'] = result.get('evaluations', {}) + record['metadata'] = result.get('metadata', {}) + record['tags'] = result.get('tags', []) + record['error'] = result.get('error') + + data.append(record) + + df = pd.DataFrame(data) + if multiindex: + # Set columns as MultiIndex + df.columns = pd.MultiIndex.from_tuples(df.columns) + return df + + def export_to_jsonl(self, file_path): + """ + Exports the experiment results to a JSONL file. + + Args: + file_path (str): The path to the output JSONL file. + """ + import json + + with open(file_path, 'w') as f: + for result in self.experiment_rows: + json_line = json.dumps(result) + f.write(json_line + '\n') + def _make_id() -> str: """Generate a unique identifier. @@ -603,6 +945,164 @@ def exp_http_request(method: str, url: str, body: Optional[bytes] = None) -> HTT "Content-Type": "application/json", } url = BASE_URL + url - resp = HTTPResponse(http_request(method, url, headers=headers, body=body)) + resp = http_request(method, url, headers=headers, body=body) if resp.status_code >= 400: - raise ValueError(f"Failed to make request, got status code {resp.status_code}.") + try: + error_details = resp.json() + error_message = error_details.get('errors', [{}])[0].get('detail', resp.text()) + except Exception: + error_message = resp.text() + raise ValueError(f"Request failed with status code {resp.status_code}: {error_message}") + return resp + + +def task(func): + if func.__name__ == "task": + raise ValueError("Function name 'task' is reserved. Please use a different name for your task function.") + + @wraps(func) + def wrapper(input: Dict[str, Any], config: Optional[Dict[str, Any]] = None) -> Any: + # Call the original function with or without config + if 'config' in inspect.signature(func).parameters: + return func(input, config) + else: + return func(input) + # Enforce signature compliance + sig = inspect.signature(func) + params = sig.parameters + if 'input' not in params: + raise TypeError("Task function must have an 'input' parameter.") + # Set attribute to indicate whether the function accepts config + wrapper._accepts_config = 'config' in params + wrapper._is_task = True # Set attribute to indicate decoration + return wrapper + + +def evaluator(func): + @wraps(func) + def wrapper(expected_output: Dict[str, Any], output: Any, input: Dict[str, Any] = None) -> Any: + return func(expected_output, output, input) + # Enforce signature compliance + sig = inspect.signature(func) + params = sig.parameters + required_params = ['expected_output', 'output', 'input'] + if not all(param in params for param in required_params): + raise TypeError(f"Evaluator function must have parameters {required_params}.") + wrapper._is_evaluator = True # Set attribute to indicate decoration + return wrapper + + +class ExperimentGrid: + """Class to run a grid of experiments over multiple parameter combinations. + + Attributes: + name (str): Name of the experiment grid. + task (Callable): The task function to execute. + dataset (Dataset): The dataset to use. + evaluators (List[Callable]): List of evaluator functions. + config (Dict[str, List[Any]]): Parameter grid to run over. + tags (List[str]): List of tags. + project_name (str): Name of the project. + description (str): Description of the experiment grid. + metadata (Dict[str, Any]): Metadata dictionary. + experiments (List[Experiment]): List of experiments created. 
+ results (List[ExperimentResults]): List of corresponding results. + """ + + def __init__( + self, + name: str, + task: Callable, + dataset: Dataset, + evaluators: List[Callable], + config: Dict[str, List[Any]], + tags: List[str] = [], + project_name: str = "-", + description: str = "", + metadata: Dict[str, Any] = {}, + ) -> None: + self.name = name + self.task = task + self.dataset = dataset + self.evaluators = evaluators + self.config = config + self.tags = tags + self.project_name = project_name + self.description = description + self.metadata = metadata + self.experiments = [] + self.results = [] + + # Generate all parameter combinations and create experiments + self._generate_experiments() + + def _generate_experiments(self): + keys, values = zip(*self.config.items()) + param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)] + + for params in param_combinations: + # Create config for the experiment + config = params.copy() + + # Compute hash of the config + config_str = json.dumps(config, sort_keys=True) + config_hash = hashlib.md5(config_str.encode('utf-8')).hexdigest() + config_hash_tag = f"config_hash:{config_hash}" + + # Generate a unique name for each experiment + experiment_name = f"{self.name}_" + "_".join(f"{k}_{v}" for k, v in params.items()) + + # Create tags for parameters + param_tags = [f"{k}:{v}" for k, v in params.items()] + [config_hash_tag] + + # Create a new experiment instance with updated config and name + experiment = Experiment( + name=experiment_name, + task=self.task, + dataset=self.dataset, + evaluators=self.evaluators, + tags=self.tags + param_tags, + project_name=self.project_name, + description=self.description, + metadata={**self.metadata, "config": config}, + config=config, + ) + + # Add the experiment to the list without running it + self.experiments.append(experiment) + + def __len__(self): + return len(self.experiments) + + def __getitem__(self, index): + return self.experiments[index] + + # Update the run method to use the pre-generated experiments + def run(self, _jobs: int = 10): + """Run experiments for all combinations of parameters in the grid. + + Args: + _jobs (int): Number of parallel workers for each experiment run. + """ + for experiment in self.experiments: + experiment.run(_jobs=_jobs) + self.results.append(experiment.get_results()) + + return self.results + + def get_all_results(self) -> List[ExperimentResults]: + """Return all results from the experiment grid. + + Returns: + List[ExperimentResults]: A list of results for each experiment. 
+ """ + return self.results + + +def _print_progress_bar(iteration, total, prefix='', suffix='', decimals=1, length=50, fill='█'): + percent = f"{100 * (iteration / float(total)):.{decimals}f}" + filled_length = int(length * iteration // total) + bar = fill * filled_length + '-' * (length - filled_length) + print(f'\r{prefix} |{bar}| {percent}% {suffix}', end='\r') + if iteration == total: + print() diff --git a/ddtrace/llmobs/_utils.py b/ddtrace/llmobs/_utils.py index 898707998ec..8907ebd6265 100644 --- a/ddtrace/llmobs/_utils.py +++ b/ddtrace/llmobs/_utils.py @@ -4,6 +4,7 @@ from typing import Optional from typing import Union import urllib.request +from urllib.error import HTTPError import ddtrace from ddtrace import Span @@ -168,28 +169,46 @@ def safe_json(obj): class HTTPResponse: - def __init__(self, resp: http.client.HTTPResponse) -> None: + def __init__(self, resp) -> None: + if resp is None: + raise ValueError("Response object cannot be None") self._resp = resp + self._content = None # Cache the content @property def status_code(self) -> int: - return self._resp.status + if hasattr(self._resp, 'status'): + return self._resp.status + elif hasattr(self._resp, 'code'): + return self._resp.code + elif hasattr(self._resp, 'getcode'): + return self._resp.getcode() + else: + raise AttributeError(f"Could not find status code in response object of type {type(self._resp)}") + + def read(self) -> bytes: + if self._content is None: + self._content = self._resp.read() + return self._content + + def text(self) -> str: + return self.read().decode('utf-8') def json(self) -> dict: - """Return the JSON content of the response. - - Note that this method can only be called once as the response content is read and consumed. - """ - data = self._resp.read() - return json.loads(data.decode("utf-8")) + return json.loads(self.text()) def http_request( method: str, url: str, headers: Optional[Dict[str, str]] = None, body: Optional[bytes] = None ) -> HTTPResponse: + """Make an HTTP request and return an HTTPResponse object.""" # Create the request object req = urllib.request.Request(url, data=body, method=method) if headers: - for key, value in headers.items(): - req.add_header(key, value) - return HTTPResponse(urllib.request.urlopen(req)) + req.headers.update(headers) + try: + response = urllib.request.urlopen(req) + return HTTPResponse(response) + except HTTPError as e: + # Create an HTTPResponse object from the error response + return HTTPResponse(e) From 351cd7a49b45bd59657bbd0205691d41ea2571a2 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Sun, 10 Nov 2024 17:50:40 -0500 Subject: [PATCH 18/36] Add error handling on tasks --- ddtrace/llmobs/_experiments.py | 624 +++++++++++++++++++++------------ 1 file changed, 402 insertions(+), 222 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index fe093aec074..9c6f3f3b3c9 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,4 +1,8 @@ -# TODO: Add error handling +# TODO: Test failures on badly defined evaluators +# TODO: Test workflows for re-evals and publishing results +# TODO: Handle behavior pushing experiment results without dataset +# TODO: Idempotency of push/pull methods +# TODO: Support running on subsets of datasets import concurrent.futures from datetime import datetime @@ -14,11 +18,11 @@ from enum import Enum import itertools import hashlib -import threading from ._utils import HTTPResponse from ._utils import http_request +import ddtrace DD_SITE = os.getenv("DD_SITE", 
"datadoghq.com") BASE_URL = f"https://api.{DD_SITE}" @@ -117,6 +121,7 @@ def pull(cls, name: str) -> "Dataset": encoded_name = quote(name) url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" resp = exp_http_request("GET", url) + response_data = resp.json() datasets = response_data.get("data", []) @@ -194,6 +199,9 @@ def push(self) -> None: resp = exp_http_request("POST", url, body=json.dumps(records_payload).encode("utf-8")) data = resp.json() + # Print url to the dataset in Datadog + print(f"Dataset '{self.name}' created: {BASE_URL}/llm/experiments/datasets/{dataset_id}") + @classmethod def from_csv( cls, @@ -437,10 +445,13 @@ class Experiment: evaluators (List[Callable]): Functions that evaluate task outputs tags (List[str]): Tags for organizing experiments project_name (str): Name of the project this experiment belongs to + description (str): Description of the experiment + metadata (Dict[str, Any]): Additional metadata for the experiment + config (Optional[Dict[str, Any]]): Configuration for the task has_run (bool): Whether the experiment has been executed has_evaluated (bool): Whether the evaluations have been performed outputs (List[Dict]): Outputs after running the task - results (ExperimentResults): Results after running evaluations + evaluations (List[Dict]): Evaluation results after running evaluators """ def __init__( @@ -478,90 +489,228 @@ def __init__( self.has_run = False self.has_evaluated = False self.outputs = [] - self.results = None + self.evaluations = [] - def run_task(self, _jobs: int = 10) -> None: + def run_task( + self, + _jobs: int = 10, + timeout: Optional[float] = None, + retries: int = 0, + max_delay: float = 60.0, + raise_on_error: bool = False, + ) -> None: """Execute the task function on the dataset and store the outputs.""" if not 1 <= _jobs <= 20: raise ValueError("Number of jobs must be between 1 and 20") + if retries < 0: + raise ValueError("Number of retries must be non-negative") self.outputs = [] total_rows = len(self.dataset) completed = 0 def process_row(idx_row): idx, row = idx_row - try: - # Extract the input data - input_data = row['input'] - # Apply the task function to the input data with config - start_time = time.time() - if getattr(self.task, '_accepts_config', False): - output = self.task(input_data, self.config) - else: - output = self.task(input_data) - end_time = time.time() - duration = end_time - start_time - - # **Ensure output is a dictionary** - if not isinstance(output, dict): - output = {'value': output} - - # Prepare output data - output_data = { - "idx": idx, - "output": output, - "metadata": { - "timestamp": start_time, - "duration": duration, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "error": None, - } - return output_data - - except Exception as e: - output_data = { - "idx": idx, - "output": None, - "metadata": { - "timestamp": time.time(), - "duration": 0, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "error": str(e), - } - return output_data + attempt = 0 + delay = 1.0 # Initial delay in seconds + + while attempt <= retries: + try: + # Extract the input data + input_data = row['input'] + start_time = time.time() + + def execute_task(): + if getattr(self.task, '_accepts_config', False): + return self.task(input_data, self.config) + else: + return self.task(input_data) + + # Use ThreadPoolExecutor to enforce 
timeout + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as single_executor: + future = single_executor.submit(execute_task) + output = future.result(timeout=timeout) + + end_time = time.time() + duration = end_time - start_time + + # Ensure output is a dictionary + if not isinstance(output, dict): + output = {'value': output} + + # Prepare output data + output_data = { + "idx": idx, + "output": output, + "metadata": { + "timestamp": start_time, + "duration": duration, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "error": { + "message": None, + "stack": None, + "type": None, + } + } + return output_data + + except concurrent.futures.TimeoutError as e: + if raise_on_error: + # Reraise the exception to trigger cancellation + raise Exception(f"TimeoutError in task for row {idx}: {e}") from e + if attempt < retries: + # Exponential backoff and retry + sleep_time = min(delay, max_delay) + time.sleep(sleep_time) + delay *= 2 + attempt += 1 + else: + # All retries exhausted, record the timeout error + output_data = { + "idx": idx, + "output": None, + "metadata": { + "timestamp": time.time(), + "duration": 0, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "error": { + "message": "Task timed out", + "stack": None, + "type": "TimeoutError", + } + } + return output_data + + except Exception as e: + if raise_on_error: + # Reraise the exception to trigger cancellation + error_type = type(e).__name__ + raise Exception(f"Exception in task for row {idx}: {error_type}: {e}") from e + if attempt < retries: + # Exponential backoff and retry + sleep_time = min(delay, max_delay) + time.sleep(sleep_time) + delay *= 2 + attempt += 1 + else: + # All retries exhausted, record the error + output_data = { + "idx": idx, + "output": None, + "metadata": { + "timestamp": time.time(), + "duration": 0, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "error": { + "message": str(e), + "stack": None, + "type": type(e).__name__, + } + } + return output_data # Initialize the progress bar _print_progress_bar(0, total_rows, prefix='Processing:', suffix='Complete') + # Use a flag to determine if an error occurred + error_occurred = False + error_exception = None + with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: - future_to_idx = {executor.submit(process_row, (idx, row)): idx for idx, row in enumerate(self.dataset)} + # Submit the process_row function to the executor for each dataset record + futures = {executor.submit(process_row, (idx, row)): idx for idx, row in enumerate(self.dataset)} outputs_buffer = [None] * total_rows - for future in concurrent.futures.as_completed(future_to_idx): - idx = future_to_idx[future] - output_data = future.result() - outputs_buffer[idx] = output_data - completed += 1 - _print_progress_bar(completed, total_rows, prefix='Processing:', suffix='Complete') + try: + for future in concurrent.futures.as_completed(futures): + idx = futures[future] + try: + output_data = future.result() + outputs_buffer[idx] = output_data + if raise_on_error and output_data['error']['message']: + # An error occurred; cancel all futures + error_occurred = True + error_exception = Exception(f"Task failed on row {idx}: {output_data['error']['message']}") + break + except Exception as e: + outputs_buffer[idx] 
= { + "idx": idx, + "output": None, + "metadata": { + "timestamp": time.time(), + "duration": 0, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "error": { + "message": str(e), + "stack": None, + "type": type(e).__name__, + } + } + if raise_on_error: + # An exception occurred; cancel all futures + error_occurred = True + error_exception = e + break + completed += 1 + _print_progress_bar(completed, total_rows, prefix='Processing:', suffix='Complete') + finally: + if error_occurred: + # Cancel all pending futures + for future in futures: + future.cancel() + # Shutdown the executor immediately + executor.shutdown(wait=False) + raise error_exception self.outputs = outputs_buffer self.has_run = True - def run_evaluations(self) -> None: - """Run evaluators on the outputs and store the results.""" + # Log error statistics if any errors occurred + error_count = sum(1 for output in self.outputs if output['error']['message'] is not None) + if error_count > 0: + error_rate = (error_count / total_rows) * 100 + print(f"Task completed with {error_count} errors ({error_rate:.2f}% error rate)") + + def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_on_error: bool = False) -> "ExperimentResults": + """Run evaluators on the outputs and return ExperimentResults. + + Args: + evaluators (Optional[List[Callable]]): List of evaluators to use. If None, uses the experiment's evaluators. + raise_on_error (bool): If True, raises exceptions encountered during evaluation. + + Returns: + ExperimentResults: A new ExperimentResults instance with the evaluation results. + + Raises: + ValueError: If task has not been run yet + """ if not self.has_run: raise ValueError("Task has not been run yet. 
Please call run_task() before run_evaluations().") - self.results = ExperimentResults(self.dataset, self) - results_buffer = [] + # Use provided evaluators or fall back to experiment's evaluators + evaluators_to_use = evaluators if evaluators is not None else self.evaluators + + # Validate that all evaluators have the @evaluator decorator + for evaluator_func in evaluators_to_use: + if not hasattr(evaluator_func, '_is_evaluator'): + raise TypeError(f"Evaluator '{evaluator_func.__name__}' must be decorated with @evaluator decorator.") + + evaluations = [] total_rows = len(self.outputs) completed = 0 @@ -570,7 +719,7 @@ def run_evaluations(self) -> None: for idx, output_data in enumerate(self.outputs): try: - # Retrieve output from output_data + # Retrieve output from outputs output = output_data["output"] # Get the corresponding dataset row dataset_row = self.dataset[idx] @@ -578,46 +727,60 @@ def run_evaluations(self) -> None: expected_output = dataset_row.get('expected_output', {}) # Perform evaluation - evaluations = {} - for evaluator in self.evaluators: + evaluations_dict = {} + for evaluator in evaluators_to_use: evaluation_result = evaluator(expected_output, output, input_data) - evaluations[evaluator.__name__] = evaluation_result + evaluations_dict[evaluator.__name__] = evaluation_result + + # Store evaluation results + evaluations.append({ + "idx": idx, + "evaluations": evaluations_dict, + "error": None, + }) - # Prepare result data - result = { - "output": output, - "evaluations": evaluations, - "metadata": output_data["metadata"], - "tags": self.tags, - "error": None #TODO: Add error handling - } except Exception as e: - result = { - "output": output_data.get('output'), + if raise_on_error: + raise e + evaluations.append({ + "idx": idx, "evaluations": {}, - "metadata": output_data["metadata"], - "tags": self.tags, - "error": str(e), - } + "error": { + "message": str(e), + "type": type(e).__name__, + "stack": None, + }, + }) - results_buffer.append(result) completed += 1 _print_progress_bar(completed, total_rows, prefix='Evaluating:', suffix='Complete') - self.has_evaluated = True - self.results.experiment_rows = results_buffer + # Return new ExperimentResults without modifying the experiment's state + return ExperimentResults(self.dataset, self, self.outputs, evaluations) - def run(self, _jobs: int = 10) -> "ExperimentResults": - """Execute the task and evaluations, returning the results.""" - self.run_task(_jobs=_jobs) - self.run_evaluations() - print() # Move to the next line after completion - return self.results + def run( + self, + _jobs: int = 10, + timeout: Optional[float] = None, + retries: int = 0, + max_delay: float = 60.0, + raise_on_error: bool = False, + ) -> "ExperimentResults": + """Execute the task and evaluations, returning the results. - def get_results(self) -> "ExperimentResults": - if not self.has_evaluated: - raise ValueError("Evaluations have not been performed yet. Please call run() or run_evaluations().") - return self.results + Args: + _jobs (int): Number of worker threads. + timeout (float, optional): Time limit for the task execution in seconds. + retries (int): Number of retries for failed tasks. + max_delay (float): Maximum delay between retries in seconds. + + Returns: + ExperimentResults: The results of the experiment. 
+ """ + self.run_task(_jobs=_jobs, timeout=timeout, retries=retries, max_delay=max_delay, raise_on_error=raise_on_error) + experiment_results = self.run_evaluations(raise_on_error=raise_on_error) + print() # Move to the next line after completion + return experiment_results class ExperimentResults: @@ -629,22 +792,122 @@ class ExperimentResults: Attributes: dataset (Dataset): The dataset used in the experiment experiment (Experiment): The experiment that generated these results - experiment_rows (List[Dict]): Results for each processed record + outputs (List[Dict]): Outputs after running the task + evaluations (List[Dict]): Evaluation results after running evaluators """ - def __init__(self, dataset: Dataset, experiment: Experiment) -> None: + def __init__(self, dataset: Dataset, experiment: Experiment, outputs: List[Dict], evaluations: List[Dict]) -> None: self.dataset = dataset self.experiment = experiment - self.experiment_rows = [] + self.outputs = outputs # List of outputs from run_task + self.evaluations = evaluations # List of evaluations from run_evaluations + self.merged_results = self._merge_results() # Merged outputs and evaluations + + def _merge_results(self) -> List[Dict[str, Any]]: + """Merge outputs and evaluations into a single list of results.""" + merged_results = [] + for idx in range(len(self.outputs)): + output_data = self.outputs[idx] + evaluation_data = self.evaluations[idx] + dataset_record = self.dataset[idx] + + merged_result = { + "idx": idx, + "input": dataset_record.get('input', {}), + "expected_output": dataset_record.get('expected_output', {}), + "output": output_data.get('output'), + "evaluations": evaluation_data.get('evaluations', {}), + "metadata": output_data.get('metadata', {}), + "error": output_data.get('error'), + "tags": self.experiment.tags, + } + merged_results.append(merged_result) + return merged_results def __iter__(self) -> Iterator[Dict[str, Any]]: - return iter(self.experiment_rows) + return iter(self.merged_results) def __len__(self) -> int: - return len(self.experiment_rows) + return len(self.merged_results) def __getitem__(self, index: int) -> Any: - return self.experiment_rows[index] + return self.merged_results[index] + + def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": + """Convert the experiment results to a pandas DataFrame, including the experiment config. + + Args: + multiindex (bool): If True, expand nested dictionaries into MultiIndex columns. + If False, keep the nested dictionaries as they are. + + Returns: + pd.DataFrame: A DataFrame representation of the experiment results. + + Raises: + ImportError: If pandas is not installed. + """ + try: + import pandas as pd + except ImportError: + raise ImportError( + "pandas is required to convert experiment results to DataFrame. 
" + "Please install it with `pip install pandas`" + ) + + data = [] + + for result in self.merged_results: + record = {} + if multiindex: + # Flatten 'input' + for k, v in result['input'].items(): + record[('input', k)] = v + # Flatten 'expected_output' + for k, v in result['expected_output'].items(): + record[('expected_output', k)] = v + # Flatten 'output' + output = result.get('output', {}) + if isinstance(output, dict): + for k, v in output.items(): + record[('output', k)] = v + else: + record[('output', 'value')] = output + # Flatten 'evaluations' + for eval_name, eval_result in result['evaluations'].items(): + if isinstance(eval_result, dict): + for k, v in eval_result.items(): + record[('evaluations', eval_name, k)] = v + else: + record[('evaluations', eval_name)] = eval_result + # Flatten 'metadata' + for k, v in result.get('metadata', {}).items(): + record[('metadata', k)] = v + # Include 'config' from the experiment + if self.experiment.config: + for k, v in self.experiment.config.items(): + record[('config', k)] = v + # Flatten 'error' + error = result['error'] + if error: + record[('error', 'message')] = error.get('message') + record[('error', 'type')] = error.get('type') + record[('error', 'stack')] = error.get('stack') + + else: + # Keep nested structures + record['input'] = result['input'] + record['expected_output'] = result['expected_output'] + record['output'] = result.get('output') + record['evaluations'] = result.get('evaluations') + record['metadata'] = result.get('metadata') + record['config'] = self.experiment.config + record['error'] = result.get('error') + data.append(record) + + df = pd.DataFrame(data) + if multiindex: + df.columns = pd.MultiIndex.from_tuples(df.columns) + return df def push(self, overwrite: bool = False) -> None: """Push the experiment results to Datadog. 
@@ -728,7 +991,6 @@ def push(self, overwrite: bool = False) -> None: "dataset_id": self.experiment.dataset._datadog_dataset_id, "project_id": project_id, "metadata": { - "tags": self.experiment.tags, **self.experiment.metadata, "config": self.experiment.config, }, @@ -744,35 +1006,42 @@ def push(self, overwrite: bool = False) -> None: spans = [] metrics = [] - for idx, result in enumerate(self.experiment_rows): + for result in self.merged_results: + idx = result['idx'] + merged_result = result + output = merged_result.get('output') + evaluations = merged_result.get('evaluations', {}) + metadata = merged_result.get('metadata', {}) + error = merged_result.get('error', {}) + + # Prepare span data span = { "span_id": _make_id(), "project_id": project_id, "experiment_id": experiment_id, "dataset_id": self.experiment.dataset._datadog_dataset_id, "dataset_record_id": _make_id(), - "start_ns": int(result["metadata"]["timestamp"] * 1e9), - "duration": float(result["metadata"]["duration"] * 1e9), - "tags": self.experiment.tags, - "status": "ok", + "start_ns": int(metadata.get("timestamp", time.time()) * 1e9), + "duration": float(metadata.get("duration", 0) * 1e9), + "status": "ok" if not error else "error", "metrics": {}, # TODO: Fill in with actual metrics once we have tracing and llm spans "meta": { "span": {"kind": "experiment"}, - "input": self.experiment.dataset[idx]["input"], - "output": result["output"], - "expected_output": self.experiment.dataset[idx].get("expected_output", {}), + "input": merged_result.get('input', {}), + "output": output, + "expected_output": merged_result.get('expected_output', {}), "error": { - "message": result["error"], - "stack": None, - "type": None, - }, + "message": error.get("message"), + "type": error.get("type"), + "stack": error.get("stack"), + } }, } spans.append(span) # Add evaluation metrics - for metric_name, metric_value in result["evaluations"].items(): - timestamp_ms = int(result["metadata"]["timestamp"] * 1000) + for metric_name, metric_value in evaluations.items(): + timestamp_ms = int(metadata.get("timestamp", time.time()) * 1000) # Check for bool first, since bool is a subclass of int if isinstance(metric_value, bool): @@ -792,115 +1061,24 @@ def push(self, overwrite: bool = False) -> None: "score_value" if metric_type == "score" else "categorical_value": metric_value, } - if metric_type == "score": - metric["score_value"] = metric_value - else: - metric["categorical_value"] = metric_value - metrics.append(metric) + # Prepare payload and send to Datadog results_payload = { "data": { "type": "experiments", + "tags": self.experiment.tags + ["ddtrace.version:" + ddtrace.__version__], "attributes": {"spans": spans, "metrics": metrics}, } } + print(json.dumps(results_payload, indent=2)) + url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" exp_http_request("POST", url, body=json.dumps(results_payload).encode("utf-8")) - def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": - """Convert the experiment results to a pandas DataFrame, including the experiment config. - - Args: - multiindex (bool): If True, expand nested dictionaries into MultiIndex columns. - If False, keep the nested dictionaries as they are. - - Returns: - pd.DataFrame: A DataFrame representation of the experiment results. - - Raises: - ImportError: If pandas is not installed. - """ - try: - import pandas as pd - except ImportError: - raise ImportError( - "pandas is required to convert experiment results to DataFrame. 
" - "Please install it with `pip install pandas`" - ) - - # Collect data - data = [] - for result in self.experiment_rows: - record = {} - # Get index of the dataset record - idx = result['metadata'].get('dataset_record_idx') - dataset_record = self.dataset[idx] - - if multiindex: - - # Flatten 'input' and 'expected_output' from the dataset - for k, v in dataset_record.get('input', {}).items(): - record[('input', k)] = v - for k, v in dataset_record.get('expected_output', {}).items(): - record[('expected_output', k)] = v - - # Flatten 'output' from the result - output = result.get('output', {}) - if isinstance(output, dict): - for k, v in output.items(): - record[('output', k)] = v - else: - record[('output', 'value')] = output - - # Flatten 'evaluations' from the result - evaluations = result.get('evaluations', {}) - for evaluator_name, evaluation in evaluations.items(): - if isinstance(evaluation, dict): - for k, v in evaluation.items(): - record[('evaluations', evaluator_name, k)] = v - else: - record[('evaluations', evaluator_name)] = evaluation - - # Flatten 'config' from the experiment, if it exists - if self.experiment.config: - for k, v in self.experiment.config.items(): - record[('config', k)] = v - - # Flatten 'metadata' from the result - for k, v in result.get('metadata', {}).items(): - # Skip project_name, experiment_name, and dataset_name - if k not in ['project_name', 'experiment_name', 'dataset_name']: - record[('metadata', k)] = v - - - # Include 'error' if any - error = result.get('error') - if error: - record[('error', 'message')] = error - - else: - # Include config as a dictionary, if it exists - if self.experiment.config: - record['config'] = self.experiment.config - - # Keep nested dictionaries - record['input'] = dataset_record.get('input', {}) - record['expected_output'] = dataset_record.get('expected_output', {}) - record['output'] = result.get('output', {}) - record['evaluations'] = result.get('evaluations', {}) - record['metadata'] = result.get('metadata', {}) - record['tags'] = result.get('tags', []) - record['error'] = result.get('error') - - data.append(record) - - df = pd.DataFrame(data) - if multiindex: - # Set columns as MultiIndex - df.columns = pd.MultiIndex.from_tuples(df.columns) - return df + # Print URL to the experiment in Datadog + print(f"Experiment '{self.experiment.name}' created: {BASE_URL}/llm/experiments/experiment-list/{experiment_id}") def export_to_jsonl(self, file_path): """ @@ -912,7 +1090,7 @@ def export_to_jsonl(self, file_path): import json with open(file_path, 'w') as f: - for result in self.experiment_rows: + for result in self.merged_results: json_line = json.dumps(result) f.write(json_line + '\n') @@ -946,6 +1124,8 @@ def exp_http_request(method: str, url: str, body: Optional[bytes] = None) -> HTT } url = BASE_URL + url resp = http_request(method, url, headers=headers, body=body) + if resp.status_code == 403: + raise ValueError("API key or application key is incorrect.") if resp.status_code >= 400: try: error_details = resp.json() @@ -992,6 +1172,15 @@ def wrapper(expected_output: Dict[str, Any], output: Any, input: Dict[str, Any] return wrapper +def _print_progress_bar(iteration, total, prefix='', suffix='', decimals=1, length=50, fill='█'): + percent = f"{100 * (iteration / float(total)):.{decimals}f}" + filled_length = int(length * iteration // total) + bar = fill * filled_length + '-' * (length - filled_length) + print(f'\r{prefix} |{bar}| {percent}% {suffix}', end='\r') + if iteration == total: + print() + + class 
ExperimentGrid: """Class to run a grid of experiments over multiple parameter combinations. @@ -1097,12 +1286,3 @@ def get_all_results(self) -> List[ExperimentResults]: List[ExperimentResults]: A list of results for each experiment. """ return self.results - - -def _print_progress_bar(iteration, total, prefix='', suffix='', decimals=1, length=50, fill='█'): - percent = f"{100 * (iteration / float(total)):.{decimals}f}" - filled_length = int(length * iteration // total) - bar = fill * filled_length + '-' * (length - filled_length) - print(f'\r{prefix} |{bar}| {percent}% {suffix}', end='\r') - if iteration == total: - print() From 2608ba5734a9d299d62272ecb447ee5240bf7b32 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Sun, 10 Nov 2024 17:54:43 -0500 Subject: [PATCH 19/36] fix import --- tests/appsec/iast/fixtures/propagation_path.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/appsec/iast/fixtures/propagation_path.py b/tests/appsec/iast/fixtures/propagation_path.py index d645e781e3f..7dcaa737995 100644 --- a/tests/appsec/iast/fixtures/propagation_path.py +++ b/tests/appsec/iast/fixtures/propagation_path.py @@ -2,12 +2,13 @@ CAVEAT: the line number is important to some IAST tests, be careful to modify this file and update the tests if you make some changes """ -import _io import asyncio import os import re import sys +import _io + ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) From 5cbfd70a57958f9a6fc7a3f78cb146f532f2c606 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Sun, 10 Nov 2024 18:43:42 -0500 Subject: [PATCH 20/36] docstring --- ddtrace/llmobs/_experiments.py | 90 +++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 9c6f3f3b3c9..1a30e3c0780 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -213,6 +213,24 @@ def from_csv( expected_output_columns: List[str] = None, metadata_columns: List[str] = None, ) -> "Dataset": + """Create a Dataset from a CSV file. + + Args: + filepath: Path to the CSV file + name: Name of the dataset + description: Optional description of the dataset + delimiter: CSV delimiter character, defaults to comma + input_columns: List of column names to use as input data + expected_output_columns: List of column names to use as expected output data + metadata_columns: Optional list of column names to include as metadata + + Returns: + Dataset: A new Dataset instance containing the CSV data + + Raises: + ValueError: If input_columns or expected_output_columns are not provided + Exception: If there are issues reading the CSV file + """ if input_columns is None or expected_output_columns is None: raise ValueError("`input_columns` and `expected_output_columns` must be provided.") @@ -258,6 +276,23 @@ def from_csv( @classmethod def from_jsonl(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": + """Create a Dataset from a JSONL file. 
+ + Args: + filepath: Path to the JSONL file + name: Name of the dataset + description: Optional description of the dataset + input_columns: List of column names to use as input data + expected_output_columns: List of column names to use as expected output data + metadata_columns: Optional list of column names to include as metadata + + Returns: + Dataset: A new Dataset instance containing the JSONL data + + Raises: + ValueError: If input_columns or expected_output_columns are not provided + Exception: If there are issues reading the JSONL file + """ if input_columns is None or expected_output_columns is None: raise ValueError("`input_columns` and `expected_output_columns` must be provided.") @@ -289,6 +324,25 @@ def from_jsonl(cls, filepath: str, name: str, description: str = "", input_colum @classmethod def from_parquet(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": + """Create a Dataset from a Parquet file. + + Args: + filepath: Path to the Parquet file + name: Name of the dataset + description: Optional description of the dataset + input_columns: List of column names to use as input data + expected_output_columns: List of column names to use as expected output data + metadata_columns: Optional list of column names to include as metadata + + Returns: + Dataset: A new Dataset instance containing the Parquet data + + Raises: + ImportError: If pandas is not installed + ValueError: If input_columns or expected_output_columns are not provided, + if the Parquet file is empty, or if specified columns are missing + Exception: If there are issues reading the Parquet file + """ try: import pandas as pd except ImportError: @@ -340,6 +394,24 @@ def from_parquet(cls, filepath: str, name: str, description: str = "", input_col @classmethod def import_file(cls, path: str, filetype: FileType, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None, delimiter: str = ",") -> "Dataset": + """Import a dataset from a file. + + Args: + path (str): Path to the input file + filetype (FileType): Type of file to import (CSV, JSONL, or PARQUET) + name (str): Name of the dataset + description (str, optional): Description of the dataset. Defaults to "". + input_columns (List[str], optional): List of column names to use as input data. Required for CSV and PARQUET files. + expected_output_columns (List[str], optional): List of column names to use as expected output data. Required for CSV and PARQUET files. + metadata_columns (List[str], optional): List of column names to include as metadata. Defaults to None. + delimiter (str, optional): Delimiter character for CSV files. Defaults to ",". + + Returns: + Dataset: A new Dataset instance containing the imported data + + Raises: + ValueError: If filetype is not supported or if required columns are missing + """ if filetype == FileType.CSV: return cls.from_csv( filepath=path, @@ -499,7 +571,21 @@ def run_task( max_delay: float = 60.0, raise_on_error: bool = False, ) -> None: - """Execute the task function on the dataset and store the outputs.""" + """Execute the task function on the dataset and store the outputs. + + Args: + _jobs: Number of concurrent jobs to run (between 1-20). Defaults to 10. + timeout: Maximum time in seconds to wait for each task execution. + If None, will wait indefinitely. Defaults to None. 
+ retries: Number of retry attempts for failed tasks. Defaults to 0. + max_delay: Maximum delay in seconds between retries using exponential backoff. + Defaults to 60 seconds. + raise_on_error: If True, raises exceptions from failed tasks. If False, stores + errors in the output. Defaults to False. + + Raises: + ValueError: If _jobs is not between 1 and 20, or if retries is negative. + """ if not 1 <= _jobs <= 20: raise ValueError("Number of jobs must be between 1 and 20") if retries < 0: @@ -773,6 +859,8 @@ def run( timeout (float, optional): Time limit for the task execution in seconds. retries (int): Number of retries for failed tasks. max_delay (float): Maximum delay between retries in seconds. + raise_on_error (bool): If True, raises exceptions from failed tasks. If False, stores + errors in the output. Defaults to False. Returns: ExperimentResults: The results of the experiment. From bed12614d0a45ebc9a2b86f956e428d7a9d385f6 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Sun, 10 Nov 2024 18:58:03 -0500 Subject: [PATCH 21/36] Custom Exception classes --- ddtrace/llmobs/_experiments.py | 60 ++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 1a30e3c0780..c9b0846b604 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -269,8 +269,14 @@ def from_csv( 'expected_output': expected_output_data, **metadata, }) + except FileNotFoundError as e: + raise DatasetFileError(f"CSV file not found: {filepath}") from e + except PermissionError as e: + raise DatasetFileError(f"Permission denied when reading CSV file: {filepath}") from e + except csv.Error as e: + raise DatasetFileError(f"Error parsing CSV file: {e}") from e except Exception as e: - raise Exception(f"Failed to read CSV file: {e}") + raise DatasetFileError(f"Unexpected error reading CSV file: {e}") from e return cls(name=name, data=data, description=description) @@ -317,8 +323,14 @@ def from_jsonl(cls, filepath: str, name: str, description: str = "", input_colum if not data: raise ValueError("JSONL file is empty.") + except FileNotFoundError as e: + raise DatasetFileError(f"JSONL file not found: {filepath}") from e + except PermissionError as e: + raise DatasetFileError(f"Permission denied when reading JSONL file: {filepath}") from e + except json.JSONDecodeError as e: + raise DatasetFileError(f"Error parsing JSONL file: {e}") from e except Exception as e: - raise Exception(f"Failed to read JSONL file: {e}") + raise DatasetFileError(f"Unexpected error reading JSONL file: {e}") from e return cls(name=name, data=data, description=description) @@ -387,8 +399,12 @@ def from_parquet(cls, filepath: str, name: str, description: str = "", input_col **metadata, }) + except FileNotFoundError as e: + raise DatasetFileError(f"Parquet file not found: {filepath}") from e + except PermissionError as e: + raise DatasetFileError(f"Permission denied when reading Parquet file: {filepath}") from e except Exception as e: - raise Exception(f"Failed to read Parquet file: {e}") + raise DatasetFileError(f"Error reading Parquet file: {e}") from e return cls(name=name, data=data, description=description) @@ -485,9 +501,7 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": # Set columns as MultiIndex df.columns = pd.MultiIndex.from_tuples(df.columns) return df - else: - # Keep 'input' and 'expected_output' as dicts in the DataFrame - return pd.DataFrame(self._data) + return pd.DataFrame(self._data) 
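The loader changes in this commit funnel FileNotFoundError, PermissionError, and parse failures into the DatasetFileError type defined at the bottom of this patch, so callers can guard ingestion with a single except clause. A brief usage sketch with a purely illustrative file path and column names; the import path for DatasetFileError is assumed from where this patch defines it:

    from ddtrace.llmobs import Dataset
    from ddtrace.llmobs._experiments import DatasetFileError  # assumed module path

    try:
        dataset = Dataset.from_csv(
            filepath="capitals.csv",                 # illustrative file
            name="capitals-of-the-world",
            input_columns=["country"],
            expected_output_columns=["capital"],
        )
    except DatasetFileError as exc:
        # Missing file, permission problem, or CSV parse error; the original
        # exception stays attached via the `raise ... from e` chaining above.
        print(f"Dataset could not be loaded: {exc}")
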
def export_to_jsonl(self, file_path): """ @@ -608,8 +622,7 @@ def process_row(idx_row): def execute_task(): if getattr(self.task, '_accepts_config', False): return self.task(input_data, self.config) - else: - return self.task(input_data) + return self.task(input_data) # Use ThreadPoolExecutor to enforce timeout with concurrent.futures.ThreadPoolExecutor(max_workers=1) as single_executor: @@ -645,8 +658,8 @@ def execute_task(): except concurrent.futures.TimeoutError as e: if raise_on_error: - # Reraise the exception to trigger cancellation - raise Exception(f"TimeoutError in task for row {idx}: {e}") from e + # Raise specific experiment task error + raise ExperimentTaskError(f"Task timed out after {timeout} seconds", idx, e) if attempt < retries: # Exponential backoff and retry sleep_time = min(delay, max_delay) @@ -667,7 +680,7 @@ def execute_task(): "dataset_name": self.dataset.name, }, "error": { - "message": "Task timed out", + "message": f"Task timed out after {timeout} seconds", "stack": None, "type": "TimeoutError", } @@ -676,9 +689,8 @@ def execute_task(): except Exception as e: if raise_on_error: - # Reraise the exception to trigger cancellation - error_type = type(e).__name__ - raise Exception(f"Exception in task for row {idx}: {error_type}: {e}") from e + # Raise specific experiment task error + raise ExperimentTaskError(str(e), idx, e) if attempt < retries: # Exponential backoff and retry sleep_time = min(delay, max_delay) @@ -1210,8 +1222,8 @@ def exp_http_request(method: str, url: str, body: Optional[bytes] = None) -> HTT "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), "Content-Type": "application/json", } - url = BASE_URL + url - resp = http_request(method, url, headers=headers, body=body) + full_url = BASE_URL + url + resp = http_request(method, full_url, headers=headers, body=body) if resp.status_code == 403: raise ValueError("API key or application key is incorrect.") if resp.status_code >= 400: @@ -1233,8 +1245,7 @@ def wrapper(input: Dict[str, Any], config: Optional[Dict[str, Any]] = None) -> A # Call the original function with or without config if 'config' in inspect.signature(func).parameters: return func(input, config) - else: - return func(input) + return func(input) # Enforce signature compliance sig = inspect.signature(func) params = sig.parameters @@ -1374,3 +1385,16 @@ def get_all_results(self) -> List[ExperimentResults]: List[ExperimentResults]: A list of results for each experiment. 
""" return self.results + + +class DatasetFileError(Exception): + """Exception raised when there are errors reading or processing dataset files.""" + pass + + +class ExperimentTaskError(Exception): + """Exception raised when a task fails during experiment execution.""" + def __init__(self, message: str, row_idx: int, original_error: Exception = None): + self.row_idx = row_idx + self.original_error = original_error + super().__init__(f"Task failed on row {row_idx}: {message}") From 0928224ce247ebb2287b0bbf9aa61ac98f64d0e9 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Mon, 11 Nov 2024 18:29:35 -0500 Subject: [PATCH 22/36] handle duration errors --- ddtrace/llmobs/_experiments.py | 38 ++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index c9b0846b604..6202bb28074 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,8 +1,6 @@ # TODO: Test failures on badly defined evaluators # TODO: Test workflows for re-evals and publishing results -# TODO: Handle behavior pushing experiment results without dataset # TODO: Idempotency of push/pull methods -# TODO: Support running on subsets of datasets import concurrent.futures from datetime import datetime @@ -614,10 +612,11 @@ def process_row(idx_row): delay = 1.0 # Initial delay in seconds while attempt <= retries: + start_time = time.time() try: # Extract the input data input_data = row['input'] - start_time = time.time() + def execute_task(): if getattr(self.task, '_accepts_config', False): @@ -629,8 +628,6 @@ def execute_task(): future = single_executor.submit(execute_task) output = future.result(timeout=timeout) - end_time = time.time() - duration = end_time - start_time # Ensure output is a dictionary if not isinstance(output, dict): @@ -642,7 +639,7 @@ def execute_task(): "output": output, "metadata": { "timestamp": start_time, - "duration": duration, + "duration": time.time() - start_time, "dataset_record_idx": idx, "project_name": self.project_name, "experiment_name": self.name, @@ -672,8 +669,8 @@ def execute_task(): "idx": idx, "output": None, "metadata": { - "timestamp": time.time(), - "duration": 0, + "timestamp": start_time, + "duration": time.time() - start_time, "dataset_record_idx": idx, "project_name": self.project_name, "experiment_name": self.name, @@ -703,8 +700,8 @@ def execute_task(): "idx": idx, "output": None, "metadata": { - "timestamp": time.time(), - "duration": 0, + "timestamp": start_time, + "duration": time.time() - start_time, "dataset_record_idx": idx, "project_name": self.project_name, "experiment_name": self.name, @@ -733,6 +730,7 @@ def execute_task(): try: for future in concurrent.futures.as_completed(futures): idx = futures[future] + start_time = time.time() try: output_data = future.result() outputs_buffer[idx] = output_data @@ -746,8 +744,8 @@ def execute_task(): "idx": idx, "output": None, "metadata": { - "timestamp": time.time(), - "duration": 0, + "timestamp": start_time, + "duration": time.time() - start_time, "dataset_record_idx": idx, "project_name": self.project_name, "experiment_name": self.name, @@ -970,8 +968,6 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": if isinstance(output, dict): for k, v in output.items(): record[('output', k)] = v - else: - record[('output', 'value')] = output # Flatten 'evaluations' for eval_name, eval_result in result['evaluations'].items(): if isinstance(eval_result, dict): @@ -1004,6 +1000,7 @@ def 
as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": record['error'] = result.get('error') data.append(record) + df = pd.DataFrame(data) if multiindex: df.columns = pd.MultiIndex.from_tuples(df.columns) @@ -1110,17 +1107,22 @@ def push(self, overwrite: bool = False) -> None: idx = result['idx'] merged_result = result output = merged_result.get('output') + input = merged_result.get('input', {}) evaluations = merged_result.get('evaluations', {}) + expected_output = merged_result.get('expected_output', {}) metadata = merged_result.get('metadata', {}) error = merged_result.get('error', {}) - # Prepare span data + # When the dataset is not hosted, we use the hash of the input and expected output as the dataset record id + dataset_record_id = hashlib.md5((str(input) + str(expected_output)).encode('utf-8')).hexdigest() + span = { "span_id": _make_id(), "project_id": project_id, "experiment_id": experiment_id, "dataset_id": self.experiment.dataset._datadog_dataset_id, - "dataset_record_id": _make_id(), + #TODO: Extract the record id from the dataset for hosted datasets + "dataset_record_id": dataset_record_id, "start_ns": int(metadata.get("timestamp", time.time()) * 1e9), "duration": float(metadata.get("duration", 0) * 1e9), "status": "ok" if not error else "error", @@ -1373,8 +1375,8 @@ def run(self, _jobs: int = 10): _jobs (int): Number of parallel workers for each experiment run. """ for experiment in self.experiments: - experiment.run(_jobs=_jobs) - self.results.append(experiment.get_results()) + results = experiment.run(_jobs=_jobs) + self.results.append(results) return self.results From cac1476b4c4311d71fa34219d9904dfbab10db3e Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 12 Nov 2024 12:30:43 -0500 Subject: [PATCH 23/36] more stuff --- ddtrace/llmobs/_experiments.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 6202bb28074..24899214eb7 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -279,7 +279,7 @@ def from_csv( return cls(name=name, data=data, description=description) @classmethod - def from_jsonl(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": + def _from_jsonl(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": """Create a Dataset from a JSONL file. Args: @@ -333,7 +333,7 @@ def from_jsonl(cls, filepath: str, name: str, description: str = "", input_colum return cls(name=name, data=data, description=description) @classmethod - def from_parquet(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": + def _from_parquet(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": """Create a Dataset from a Parquet file. 
Args: @@ -407,7 +407,7 @@ def from_parquet(cls, filepath: str, name: str, description: str = "", input_col return cls(name=name, data=data, description=description) @classmethod - def import_file(cls, path: str, filetype: FileType, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None, delimiter: str = ",") -> "Dataset": + def load(cls, path: str, filetype: FileType, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None, delimiter: str = ",") -> "Dataset": """Import a dataset from a file. Args: @@ -437,7 +437,7 @@ def import_file(cls, path: str, filetype: FileType, name: str, description: str metadata_columns=metadata_columns, ) elif filetype == FileType.JSONL: - return cls.from_jsonl( + return cls._from_jsonl( filepath=path, name=name, description=description, @@ -446,7 +446,7 @@ def import_file(cls, path: str, filetype: FileType, name: str, description: str metadata_columns=metadata_columns, ) elif filetype == FileType.PARQUET: - return cls.from_parquet( + return cls._from_parquet( filepath=path, name=name, description=description, @@ -578,9 +578,9 @@ def __init__( def run_task( self, _jobs: int = 10, - timeout: Optional[float] = None, - retries: int = 0, - max_delay: float = 60.0, + _timeout: Optional[float] = None, + _retries: int = 0, + _max_delay: float = 60.0, raise_on_error: bool = False, ) -> None: """Execute the task function on the dataset and store the outputs. @@ -875,7 +875,7 @@ def run( Returns: ExperimentResults: The results of the experiment. """ - self.run_task(_jobs=_jobs, timeout=timeout, retries=retries, max_delay=max_delay, raise_on_error=raise_on_error) + self.run_task(_jobs=_jobs, _timeout=timeout, _retries=retries, _max_delay=max_delay, raise_on_error=raise_on_error) experiment_results = self.run_evaluations(raise_on_error=raise_on_error) print() # Move to the next line after completion return experiment_results From 9024e14ba3bfe5b4bbdcbc88b48752d87001c048 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Thu, 14 Nov 2024 11:46:08 -0500 Subject: [PATCH 24/36] support polymorphic i/o --- ddtrace/llmobs/_experiments.py | 412 ++++++++++++++++++++++++--------- 1 file changed, 308 insertions(+), 104 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 24899214eb7..1b0a3987d06 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,13 +1,12 @@ -# TODO: Test failures on badly defined evaluators +# TODO: Test failures on eval, how do we set errors # TODO: Test workflows for re-evals and publishing results -# TODO: Idempotency of push/pull methods import concurrent.futures from datetime import datetime import json import os import time -from typing import Any, Callable, Dict, Iterator, List, Optional +from typing import Any, Callable, Dict, Iterator, List, Optional, Union import inspect from functools import wraps from urllib.parse import quote @@ -43,7 +42,14 @@ class Dataset: description (str): Optional description of the dataset """ - def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") -> None: + def __init__(self, name: str, data: List[Dict[str, Union[str, Dict[str, Any]]]], description: str = "") -> None: + """ + Args: + name: Name of the dataset + data: List of dictionaries where 'input' and 'expected_output' values can be + either strings or dictionaries of strings + description: 
Optional description of the dataset + """ self.name = name self.description = description self._validate_data(data) @@ -52,16 +58,36 @@ def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") # Post-push attributes self._datadog_dataset_id = None - def __iter__(self) -> Iterator[Dict[str, Any]]: + def __iter__(self) -> Iterator[Dict[str, Union[str, Dict[str, Any]]]]: return iter(self._data) def __len__(self) -> int: return len(self._data) - def __getitem__(self, index: int) -> Dict[str, Any]: - return self._data[index] - - def _validate_data(self, data: List[Dict[str, Any]]) -> None: + def __getitem__(self, index: int) -> Dict[str, Union[str, Dict[str, Any]]]: + """Get a dataset record, converting _str_value dictionaries back to strings. + + Args: + index: Index of the record to retrieve + + Returns: + Dict containing the record with any _str_value values converted to strings + """ + record = self._data[index].copy() + + # Convert input if it has _str_value + if 'input' in record and isinstance(record['input'], dict): + if '_str_value' in record['input'] and len(record['input']) == 1: + record['input'] = record['input']['_str_value'] + + # Convert expected_output if it has _str_value + if 'expected_output' in record and isinstance(record['expected_output'], dict): + if '_str_value' in record['expected_output'] and len(record['expected_output']) == 1: + record['expected_output'] = record['expected_output']['_str_value'] + + return record + + def _validate_data(self, data: List[Dict[str, Union[str, Dict[str, Any]]]]) -> None: """Validate the format and structure of dataset records. Args: @@ -69,8 +95,7 @@ def _validate_data(self, data: List[Dict[str, Any]]) -> None: Raises: ValueError: If data is empty, contains non-dictionary rows, - has inconsistent keys, contains nested dictionaries, - or exceeds 50,000 rows + has inconsistent keys, or exceeds 50,000 rows """ if not data: raise ValueError("Data cannot be empty.") @@ -86,20 +111,27 @@ def _validate_data(self, data: List[Dict[str, Any]]) -> None: if set(row.keys()) != first_row_keys: raise ValueError("All rows must have the same keys.") - # Validate that 'input' exists and is a dictionary - if 'input' not in row: - raise ValueError("Each row must contain an 'input' field") - if not isinstance(row['input'], dict): - raise ValueError("The 'input' field must be a dictionary") - - # If expected_output exists, validate it's a dictionary - if 'expected_output' in row and not isinstance(row['expected_output'], dict): - raise ValueError("The 'expected_output' field must be a dictionary") - - # Check that 'input' and 'expected_output' are flat dictionaries - for key in ["input", "expected_output"]: - if key in row and any(isinstance(value, dict) for value in row[key].values()): - raise ValueError(f"'{key}' must be a flat dictionary (no nested dictionaries).") + # Validate input if present + if 'input' in row: + if isinstance(row['input'], str): + # Convert string to dict with _str_value key + row['input'] = {'_str_value': row['input']} + elif isinstance(row['input'], dict): + # Do nothing + pass + else: + raise ValueError("The 'input' field must be either a string or a dictionary") + + # Validate expected_output if present + if 'expected_output' in row: + if isinstance(row['expected_output'], str): + # Convert string to dict with _str_value key + row['expected_output'] = {'_str_value': row['expected_output']} + elif isinstance(row['expected_output'], dict): + # Do nothing + pass + else: + raise ValueError("The 
'expected_output' field must be either a string or a dictionary") @classmethod def pull(cls, name: str) -> "Dataset": @@ -137,13 +169,26 @@ def pull(cls, name: str) -> "Dataset": class_records = [] for record in records_data.get("data", []): attrs = record.get("attributes", {}) - class_records.append( - { - "input": attrs.get("input", {}), - "expected_output": attrs.get("expected_output", {}), - **attrs.get("metadata", {}), - } - ) + input_data = attrs.get("input") + expected_output = attrs.get("expected_output") + + print(input_data, expected_output) + + # Handle input data format + if isinstance(input_data, str): + input_data = {'_str_value': input_data} + # For dictionaries, keep as-is (no conversion needed) + + # Handle expected output format + if isinstance(expected_output, str): + expected_output = {'_str_value': expected_output} + # For dictionaries, keep as-is (no conversion needed) + + class_records.append({ + "input": input_data, + "expected_output": expected_output, + **attrs.get("metadata", {}), + }) # Create new dataset instance dataset = cls(name, class_records) @@ -154,7 +199,7 @@ def push(self) -> None: """Push the dataset to Datadog. Returns: - Dict[str, str]: Dictionary containing dataset information including: + Dict[str, Any]: Dictionary containing dataset information including: - dataset_id: The ID of the created/updated dataset - dataset_name: The name of the dataset - record_count: Number of records uploaded @@ -256,8 +301,18 @@ def from_csv( raise ValueError(f"Metadata columns not found in CSV header: {missing_metadata_columns}") for row in rows: - input_data = {col: row[col] for col in input_columns} - expected_output_data = {col: row[col] for col in expected_output_columns} + # If single column, use string value wrapped in dict + if len(input_columns) == 1: + input_data = {'_str_value': row[input_columns[0]]} + else: + input_data = {col: row[col] for col in input_columns} + + # If single column, use string value wrapped in dict + if len(expected_output_columns) == 1: + expected_output_data = {'_str_value': row[expected_output_columns[0]]} + else: + expected_output_data = {col: row[col] for col in expected_output_columns} + metadata = {} if metadata_columns: metadata = {col: row[col] for col in metadata_columns} @@ -479,27 +534,70 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": ) if multiindex: - # Create a list of flattened dictionaries - flattened_data = [] + column_tuples = set() + data_rows = [] for record in self._data: flat_record = {} + # Handle 'input' fields - for k, v in record.get('input', {}).items(): - flat_record[('input', k)] = v + input_data = record.get('input', {}) + if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1: + flat_record[('input', '')] = input_data['_str_value'] + column_tuples.add(('input', '')) + else: + for k, v in input_data.items(): + flat_record[('input', k)] = v + column_tuples.add(('input', k)) + # Handle 'expected_output' fields - for k, v in record.get('expected_output', {}).items(): - flat_record[('expected_output', k)] = v + expected_output = record.get('expected_output', {}) + if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1: + flat_record[('expected_output', '')] = expected_output['_str_value'] + column_tuples.add(('expected_output', '')) + else: + for k, v in expected_output.items(): + flat_record[('expected_output', k)] = v + column_tuples.add(('expected_output', k)) + # Handle any other top-level fields for 
k, v in record.items(): if k not in ['input', 'expected_output']: flat_record[('metadata', k)] = v - flattened_data.append(flat_record) + column_tuples.add(('metadata', k)) + data_rows.append(flat_record) + + # Convert column_tuples to a sorted list to maintain consistent column order + column_tuples = sorted(list(column_tuples)) + + # Build the DataFrame + records_list = [] + for flat_record in data_rows: + row = [flat_record.get(col, None) for col in column_tuples] + records_list.append(row) + + df = pd.DataFrame(records_list, columns=pd.MultiIndex.from_tuples(column_tuples)) - df = pd.DataFrame(flattened_data) - # Set columns as MultiIndex - df.columns = pd.MultiIndex.from_tuples(df.columns) return df - return pd.DataFrame(self._data) + + else: + # For non-multiindex, convert _str_value in the nested structures + data = [] + for record in self._data: + new_record = {} + input_data = record.get('input', {}) + new_record['input'] = (input_data['_str_value'] + if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1 + else input_data) + expected_output = record.get('expected_output', {}) + new_record['expected_output'] = (expected_output['_str_value'] + if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1 + else expected_output) + # Copy other fields + for k, v in record.items(): + if k not in ['input', 'expected_output']: + new_record[k] = v + data.append(new_record) + return pd.DataFrame(data) def export_to_jsonl(self, file_path): """ @@ -600,7 +698,7 @@ def run_task( """ if not 1 <= _jobs <= 20: raise ValueError("Number of jobs must be between 1 and 20") - if retries < 0: + if _retries < 0: raise ValueError("Number of retries must be non-negative") self.outputs = [] total_rows = len(self.dataset) @@ -611,12 +709,13 @@ def process_row(idx_row): attempt = 0 delay = 1.0 # Initial delay in seconds - while attempt <= retries: + while attempt <= _retries: start_time = time.time() try: - # Extract the input data + # Extract the input data and convert if it's a _str_value dict input_data = row['input'] - + if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1: + input_data = input_data['_str_value'] def execute_task(): if getattr(self.task, '_accepts_config', False): @@ -626,11 +725,12 @@ def execute_task(): # Use ThreadPoolExecutor to enforce timeout with concurrent.futures.ThreadPoolExecutor(max_workers=1) as single_executor: future = single_executor.submit(execute_task) - output = future.result(timeout=timeout) + output = future.result(timeout=_timeout) - - # Ensure output is a dictionary - if not isinstance(output, dict): + # Ensure output is a dictionary with _str_value for strings + if isinstance(output, str): + output = {'_str_value': output} + elif not isinstance(output, dict): output = {'value': output} # Prepare output data @@ -654,12 +754,13 @@ def execute_task(): return output_data except concurrent.futures.TimeoutError as e: + print(f"Timeout error: {e}") if raise_on_error: # Raise specific experiment task error - raise ExperimentTaskError(f"Task timed out after {timeout} seconds", idx, e) - if attempt < retries: + raise ExperimentTaskError(f"Task timed out after {_timeout} seconds", idx, e) + if attempt < _retries: # Exponential backoff and retry - sleep_time = min(delay, max_delay) + sleep_time = min(delay, _max_delay) time.sleep(sleep_time) delay *= 2 attempt += 1 @@ -677,7 +778,7 @@ def execute_task(): "dataset_name": self.dataset.name, }, "error": { - "message": 
f"Task timed out after {timeout} seconds", + "message": f"Task timed out after {_timeout} seconds", "stack": None, "type": "TimeoutError", } @@ -685,12 +786,13 @@ def execute_task(): return output_data except Exception as e: + print(f"Error: {e}") if raise_on_error: # Raise specific experiment task error raise ExperimentTaskError(str(e), idx, e) - if attempt < retries: + if attempt < _retries: # Exponential backoff and retry - sleep_time = min(delay, max_delay) + sleep_time = min(delay, _max_delay) time.sleep(sleep_time) delay *= 2 attempt += 1 @@ -740,6 +842,7 @@ def execute_task(): error_exception = Exception(f"Task failed on row {idx}: {output_data['error']['message']}") break except Exception as e: + print(f"Error: {e}") outputs_buffer[idx] = { "idx": idx, "output": None, @@ -766,6 +869,7 @@ def execute_task(): _print_progress_bar(completed, total_rows, prefix='Processing:', suffix='Complete') finally: if error_occurred: + print(f"Error occurred: {error_exception}") # Cancel all pending futures for future in futures: future.cancel() @@ -810,23 +914,37 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_on_ total_rows = len(self.outputs) completed = 0 - # Initialize the progress bar _print_progress_bar(0, total_rows, prefix='Evaluating:', suffix='Complete') for idx, output_data in enumerate(self.outputs): try: - # Retrieve output from outputs output = output_data["output"] + # Convert output if it has '_str_value' + if isinstance(output, dict) and '_str_value' in output and len(output) == 1: + output = output['_str_value'] + # Get the corresponding dataset row dataset_row = self.dataset[idx] input_data = dataset_row.get('input', {}) expected_output = dataset_row.get('expected_output', {}) + + # Convert input_data if it has '_str_value' + if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1: + input_data = input_data['_str_value'] + + # Convert expected_output if it has '_str_value' + if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1: + expected_output = expected_output['_str_value'] # Perform evaluation evaluations_dict = {} for evaluator in evaluators_to_use: - evaluation_result = evaluator(expected_output, output, input_data) - evaluations_dict[evaluator.__name__] = evaluation_result + try: + evaluation_result = evaluator(expected_output, output, input_data) + evaluations_dict[evaluator.__name__] = evaluation_result + except Exception as e: + print(f"Error evaluating row {idx}: {type(e).__name__}: {e}, with evaluator {evaluator.__name__}") + raise e # Store evaluation results evaluations.append({ @@ -907,7 +1025,7 @@ def _merge_results(self) -> List[Dict[str, Any]]: for idx in range(len(self.outputs)): output_data = self.outputs[idx] evaluation_data = self.evaluations[idx] - dataset_record = self.dataset[idx] + dataset_record = self.dataset._data[idx] merged_result = { "idx": idx, @@ -929,14 +1047,39 @@ def __len__(self) -> int: return len(self.merged_results) def __getitem__(self, index: int) -> Any: - return self.merged_results[index] + """Get a result record, converting _str_value dictionaries back to strings. 
+ + Args: + index: Index of the record to retrieve + + Returns: + Dict containing the record with any _str_value values converted to strings + """ + result = self.merged_results[index].copy() + + # Convert input if it has _str_value + if 'input' in result and isinstance(result['input'], dict): + if '_str_value' in result['input'] and len(result['input']) == 1: + result['input'] = result['input']['_str_value'] + + # Convert expected_output if it has _str_value + if 'expected_output' in result and isinstance(result['expected_output'], dict): + if '_str_value' in result['expected_output'] and len(result['expected_output']) == 1: + result['expected_output'] = result['expected_output']['_str_value'] + + # Convert output if it has _str_value + if 'output' in result and isinstance(result['output'], dict): + if '_str_value' in result['output'] and len(result['output']) == 1: + result['output'] = result['output']['_str_value'] + + return result def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": """Convert the experiment results to a pandas DataFrame, including the experiment config. Args: multiindex (bool): If True, expand nested dictionaries into MultiIndex columns. - If False, keep the nested dictionaries as they are. + If False, keep the nested dictionaries as they are. Returns: pd.DataFrame: A DataFrame representation of the experiment results. @@ -952,59 +1095,119 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": "Please install it with `pip install pandas`" ) - data = [] + # Define the desired column order + COLUMN_ORDER = ['input', 'expected_output', 'output', 'evaluations', 'metadata', 'config', 'error'] + + data_rows = [] + column_tuples = set() for result in self.merged_results: record = {} + if multiindex: - # Flatten 'input' - for k, v in result['input'].items(): - record[('input', k)] = v - # Flatten 'expected_output' - for k, v in result['expected_output'].items(): - record[('expected_output', k)] = v - # Flatten 'output' + # Handle 'input' fields + input_data = result.get('input', {}) + if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1: + record[('input', '')] = input_data['_str_value'] + column_tuples.add(('input', '')) + else: + for k, v in input_data.items(): + record[('input', k)] = v + column_tuples.add(('input', k)) + + # Handle 'expected_output' fields + expected_output = result.get('expected_output', {}) + if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1: + record[('expected_output', '')] = expected_output['_str_value'] + column_tuples.add(('expected_output', '')) + else: + for k, v in expected_output.items(): + record[('expected_output', k)] = v + column_tuples.add(('expected_output', k)) + + # Handle 'output' fields output = result.get('output', {}) if isinstance(output, dict): - for k, v in output.items(): - record[('output', k)] = v - # Flatten 'evaluations' - for eval_name, eval_result in result['evaluations'].items(): + if '_str_value' in output and len(output) == 1: + record[('output', '')] = output['_str_value'] + column_tuples.add(('output', '')) + else: + for k, v in output.items(): + record[('output', k)] = v + column_tuples.add(('output', k)) + else: + record[('output', '')] = output + column_tuples.add(('output', '')) + + # Handle 'evaluations' fields + evaluations = result.get('evaluations', {}) + for eval_name, eval_result in evaluations.items(): if isinstance(eval_result, dict): for k, v in eval_result.items(): record[('evaluations', 
eval_name, k)] = v + column_tuples.add(('evaluations', eval_name, k)) else: record[('evaluations', eval_name)] = eval_result - # Flatten 'metadata' + column_tuples.add(('evaluations', eval_name)) + + # Handle 'metadata' fields for k, v in result.get('metadata', {}).items(): record[('metadata', k)] = v - # Include 'config' from the experiment + column_tuples.add(('metadata', k)) + + # Handle 'config' fields if self.experiment.config: for k, v in self.experiment.config.items(): record[('config', k)] = v - # Flatten 'error' - error = result['error'] + column_tuples.add(('config', k)) + + # Handle 'error' fields + error = result.get('error', {}) if error: - record[('error', 'message')] = error.get('message') - record[('error', 'type')] = error.get('type') - record[('error', 'stack')] = error.get('stack') - + for k, v in error.items(): + record[('error', k)] = v + column_tuples.add(('error', k)) + + data_rows.append(record) else: - # Keep nested structures - record['input'] = result['input'] - record['expected_output'] = result['expected_output'] - record['output'] = result.get('output') - record['evaluations'] = result.get('evaluations') - record['metadata'] = result.get('metadata') - record['config'] = self.experiment.config - record['error'] = result.get('error') - data.append(record) + # Non-multiindex implementation remains the same + new_record = {} + input_data = result.get('input', {}) + new_record['input'] = (input_data['_str_value'] + if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1 + else input_data) + expected_output = result.get('expected_output', {}) + new_record['expected_output'] = (expected_output['_str_value'] + if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1 + else expected_output) + output = result.get('output', {}) + new_record['output'] = (output['_str_value'] + if isinstance(output, dict) and '_str_value' in output and len(output) == 1 + else output) + new_record['evaluations'] = result.get('evaluations', {}) + new_record['metadata'] = result.get('metadata', {}) + new_record['config'] = self.experiment.config + new_record['error'] = result.get('error', {}) + data_rows.append(new_record) - - df = pd.DataFrame(data) if multiindex: - df.columns = pd.MultiIndex.from_tuples(df.columns) - return df + # Sort column_tuples based on the desired order + column_tuples = sorted(list(column_tuples), + key=lambda x: (COLUMN_ORDER.index(x[0]), x[1:] if len(x) > 1 else '')) + + # Build the DataFrame + records_list = [] + for record in data_rows: + row = [record.get(col, None) for col in column_tuples] + records_list.append(row) + + df = pd.DataFrame(records_list, columns=pd.MultiIndex.from_tuples(column_tuples)) + return df + else: + df = pd.DataFrame(data_rows) + # Reorder columns according to COLUMN_ORDER + cols = [col for col in COLUMN_ORDER if col in df.columns] + return df[cols] def push(self, overwrite: bool = False) -> None: """Push the experiment results to Datadog. @@ -1165,6 +1368,8 @@ def push(self, overwrite: bool = False) -> None: metrics.append(metric) + + # Prepare payload and send to Datadog results_payload = { "data": { @@ -1174,7 +1379,6 @@ def push(self, overwrite: bool = False) -> None: } } - print(json.dumps(results_payload, indent=2)) url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" exp_http_request("POST", url, body=json.dumps(results_payload).encode("utf-8")) @@ -1243,7 +1447,7 @@ def task(func): raise ValueError("Function name 'task' is reserved. 
Please use a different name for your task function.") @wraps(func) - def wrapper(input: Dict[str, Any], config: Optional[Dict[str, Any]] = None) -> Any: + def wrapper(input: Dict[str, Union[str, Dict[str, Any]]], config: Optional[Dict[str, Any]] = None) -> Any: # Call the original function with or without config if 'config' in inspect.signature(func).parameters: return func(input, config) @@ -1261,8 +1465,8 @@ def wrapper(input: Dict[str, Any], config: Optional[Dict[str, Any]] = None) -> A def evaluator(func): @wraps(func) - def wrapper(expected_output: Dict[str, Any], output: Any, input: Dict[str, Any] = None) -> Any: - return func(expected_output, output, input) + def wrapper(expected_output: Union[str, Dict[str, Any]], output: Union[str, Dict[str, Any]], input: Union[str, Dict[str, Any]] = None) -> Any: + return func(expected_output, output, input) # Enforce signature compliance sig = inspect.signature(func) params = sig.parameters From a228c30a61458d8654ac7c599cfae0d90c8ba540 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Fri, 6 Dec 2024 13:45:39 -0500 Subject: [PATCH 25/36] structure changes --- ddtrace/llmobs/__init__.py | 3 +- ddtrace/llmobs/_experiments.py | 819 ++++++++------------------------- 2 files changed, 194 insertions(+), 628 deletions(-) diff --git a/ddtrace/llmobs/__init__.py b/ddtrace/llmobs/__init__.py index bd382219754..549f83ad88f 100644 --- a/ddtrace/llmobs/__init__.py +++ b/ddtrace/llmobs/__init__.py @@ -12,8 +12,7 @@ from ._experiments import FileType from ._experiments import task from ._experiments import evaluator -from ._experiments import ExperimentGrid from ._llmobs import LLMObs -__all__ = ["LLMObs", "Dataset", "Experiment", "ExperimentResults", "FileType", "task", "evaluator", "ExperimentGrid"] +__all__ = ["LLMObs", "Dataset", "Experiment", "ExperimentResults", "FileType", "task", "evaluator"] diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 1b0a3987d06..b53f1c11708 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,5 +1,14 @@ -# TODO: Test failures on eval, how do we set errors +# TODO: Test failures on eval, how do we set errors, Report null when evaluator fails # TODO: Test workflows for re-evals and publishing results +# TODO: Test pushing experiments without data + +""" +Test coverage ideas: +- Define task and evaluator wrong +- Define experiment wrong +- Experiments with failures +- Eval failures +""" import concurrent.futures from datetime import datetime @@ -22,13 +31,11 @@ import ddtrace DD_SITE = os.getenv("DD_SITE", "datadoghq.com") -BASE_URL = f"https://api.{DD_SITE}" +BASE_URL = f"https://api.{DD_SITE}" #TODO: Change to https://api.{DD_SITE} when testing is complete in staging class FileType(Enum): CSV = 'csv' - PARQUET = 'parquet' - JSONL = 'jsonl' class Dataset: @@ -65,26 +72,15 @@ def __len__(self) -> int: return len(self._data) def __getitem__(self, index: int) -> Dict[str, Union[str, Dict[str, Any]]]: - """Get a dataset record, converting _str_value dictionaries back to strings. + """Get a dataset record. Args: index: Index of the record to retrieve Returns: - Dict containing the record with any _str_value values converted to strings + Dict containing the record. 
""" record = self._data[index].copy() - - # Convert input if it has _str_value - if 'input' in record and isinstance(record['input'], dict): - if '_str_value' in record['input'] and len(record['input']) == 1: - record['input'] = record['input']['_str_value'] - - # Convert expected_output if it has _str_value - if 'expected_output' in record and isinstance(record['expected_output'], dict): - if '_str_value' in record['expected_output'] and len(record['expected_output']) == 1: - record['expected_output'] = record['expected_output']['_str_value'] - return record def _validate_data(self, data: List[Dict[str, Union[str, Dict[str, Any]]]]) -> None: @@ -111,28 +107,6 @@ def _validate_data(self, data: List[Dict[str, Union[str, Dict[str, Any]]]]) -> N if set(row.keys()) != first_row_keys: raise ValueError("All rows must have the same keys.") - # Validate input if present - if 'input' in row: - if isinstance(row['input'], str): - # Convert string to dict with _str_value key - row['input'] = {'_str_value': row['input']} - elif isinstance(row['input'], dict): - # Do nothing - pass - else: - raise ValueError("The 'input' field must be either a string or a dictionary") - - # Validate expected_output if present - if 'expected_output' in row: - if isinstance(row['expected_output'], str): - # Convert string to dict with _str_value key - row['expected_output'] = {'_str_value': row['expected_output']} - elif isinstance(row['expected_output'], dict): - # Do nothing - pass - else: - raise ValueError("The 'expected_output' field must be either a string or a dictionary") - @classmethod def pull(cls, name: str) -> "Dataset": """Create a dataset from a dataset hosted in Datadog. @@ -171,18 +145,6 @@ def pull(cls, name: str) -> "Dataset": attrs = record.get("attributes", {}) input_data = attrs.get("input") expected_output = attrs.get("expected_output") - - print(input_data, expected_output) - - # Handle input data format - if isinstance(input_data, str): - input_data = {'_str_value': input_data} - # For dictionaries, keep as-is (no conversion needed) - - # Handle expected output format - if isinstance(expected_output, str): - expected_output = {'_str_value': expected_output} - # For dictionaries, keep as-is (no conversion needed) class_records.append({ "input": input_data, @@ -254,7 +216,6 @@ def from_csv( delimiter: str = ",", input_columns: List[str] = None, expected_output_columns: List[str] = None, - metadata_columns: List[str] = None, ) -> "Dataset": """Create a Dataset from a CSV file. 
@@ -265,7 +226,6 @@ def from_csv( delimiter: CSV delimiter character, defaults to comma input_columns: List of column names to use as input data expected_output_columns: List of column names to use as expected output data - metadata_columns: Optional list of column names to include as metadata Returns: Dataset: A new Dataset instance containing the CSV data @@ -289,33 +249,30 @@ def from_csv( header_columns = reader.fieldnames missing_input_columns = [col for col in input_columns if col not in header_columns] missing_output_columns = [col for col in expected_output_columns if col not in header_columns] - missing_metadata_columns = [] - if metadata_columns: - missing_metadata_columns = [col for col in metadata_columns if col not in header_columns] if missing_input_columns: raise ValueError(f"Input columns not found in CSV header: {missing_input_columns}") if missing_output_columns: raise ValueError(f"Expected output columns not found in CSV header: {missing_output_columns}") - if missing_metadata_columns: - raise ValueError(f"Metadata columns not found in CSV header: {missing_metadata_columns}") + + # Get metadata columns (all columns not used for input or expected output) + metadata_columns = [col for col in header_columns if col not in input_columns and col not in expected_output_columns] for row in rows: - # If single column, use string value wrapped in dict + # Handle input data if len(input_columns) == 1: - input_data = {'_str_value': row[input_columns[0]]} + input_data = row[input_columns[0]] else: input_data = {col: row[col] for col in input_columns} - # If single column, use string value wrapped in dict + # Handle expected output data if len(expected_output_columns) == 1: - expected_output_data = {'_str_value': row[expected_output_columns[0]]} + expected_output_data = row[expected_output_columns[0]] else: expected_output_data = {col: row[col] for col in expected_output_columns} - metadata = {} - if metadata_columns: - metadata = {col: row[col] for col in metadata_columns} + # Handle metadata (all remaining columns) + metadata = {col: row[col] for col in metadata_columns} data.append({ 'input': input_data, @@ -333,134 +290,6 @@ def from_csv( return cls(name=name, data=data, description=description) - @classmethod - def _from_jsonl(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": - """Create a Dataset from a JSONL file. 
- - Args: - filepath: Path to the JSONL file - name: Name of the dataset - description: Optional description of the dataset - input_columns: List of column names to use as input data - expected_output_columns: List of column names to use as expected output data - metadata_columns: Optional list of column names to include as metadata - - Returns: - Dataset: A new Dataset instance containing the JSONL data - - Raises: - ValueError: If input_columns or expected_output_columns are not provided - Exception: If there are issues reading the JSONL file - """ - if input_columns is None or expected_output_columns is None: - raise ValueError("`input_columns` and `expected_output_columns` must be provided.") - - data = [] - try: - with open(filepath, mode='r', encoding='utf-8') as jsonlfile: - for line in jsonlfile: - row = json.loads(line.strip()) - - input_data = {col: row.get(col) for col in input_columns} - expected_output_data = {col: row.get(col) for col in expected_output_columns} - metadata = {} - if metadata_columns: - metadata = {col: row.get(col) for col in metadata_columns} - - data.append({ - 'input': input_data, - 'expected_output': expected_output_data, - **metadata, - }) - - if not data: - raise ValueError("JSONL file is empty.") - - except FileNotFoundError as e: - raise DatasetFileError(f"JSONL file not found: {filepath}") from e - except PermissionError as e: - raise DatasetFileError(f"Permission denied when reading JSONL file: {filepath}") from e - except json.JSONDecodeError as e: - raise DatasetFileError(f"Error parsing JSONL file: {e}") from e - except Exception as e: - raise DatasetFileError(f"Unexpected error reading JSONL file: {e}") from e - - return cls(name=name, data=data, description=description) - - @classmethod - def _from_parquet(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": - """Create a Dataset from a Parquet file. - - Args: - filepath: Path to the Parquet file - name: Name of the dataset - description: Optional description of the dataset - input_columns: List of column names to use as input data - expected_output_columns: List of column names to use as expected output data - metadata_columns: Optional list of column names to include as metadata - - Returns: - Dataset: A new Dataset instance containing the Parquet data - - Raises: - ImportError: If pandas is not installed - ValueError: If input_columns or expected_output_columns are not provided, - if the Parquet file is empty, or if specified columns are missing - Exception: If there are issues reading the Parquet file - """ - try: - import pandas as pd - except ImportError: - raise ImportError( - "pandas is required to read parquet files. 
" - "Please install pandas with: pip install pandas" - ) - - if input_columns is None or expected_output_columns is None: - raise ValueError("`input_columns` and `expected_output_columns` must be provided.") - - data = [] - try: - df = pd.read_parquet(filepath) - if df.empty: - raise ValueError("Parquet file is empty.") - - # Ensure that the specified columns are present - missing_input_columns = [col for col in input_columns if col not in df.columns] - missing_output_columns = [col for col in expected_output_columns if col not in df.columns] - missing_metadata_columns = [] - if metadata_columns: - missing_metadata_columns = [col for col in metadata_columns if col not in df.columns] - - if missing_input_columns: - raise ValueError(f"Input columns not found in DataFrame: {missing_input_columns}") - if missing_output_columns: - raise ValueError(f"Expected output columns not found in DataFrame: {missing_output_columns}") - if missing_metadata_columns: - raise ValueError(f"Metadata columns not found in DataFrame: {missing_metadata_columns}") - - for idx, row in df.iterrows(): - input_data = {col: row[col] for col in input_columns} - expected_output_data = {col: row[col] for col in expected_output_columns} - metadata = {} - if metadata_columns: - metadata = {col: row[col] for col in metadata_columns} - - data.append({ - 'input': input_data, - 'expected_output': expected_output_data, - **metadata, - }) - - except FileNotFoundError as e: - raise DatasetFileError(f"Parquet file not found: {filepath}") from e - except PermissionError as e: - raise DatasetFileError(f"Permission denied when reading Parquet file: {filepath}") from e - except Exception as e: - raise DatasetFileError(f"Error reading Parquet file: {e}") from e - - return cls(name=name, data=data, description=description) - @classmethod def load(cls, path: str, filetype: FileType, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None, delimiter: str = ",") -> "Dataset": """Import a dataset from a file. 
@@ -491,24 +320,6 @@ def load(cls, path: str, filetype: FileType, name: str, description: str = "", i expected_output_columns=expected_output_columns, metadata_columns=metadata_columns, ) - elif filetype == FileType.JSONL: - return cls._from_jsonl( - filepath=path, - name=name, - description=description, - input_columns=input_columns, - expected_output_columns=expected_output_columns, - metadata_columns=metadata_columns, - ) - elif filetype == FileType.PARQUET: - return cls._from_parquet( - filepath=path, - name=name, - description=description, - input_columns=input_columns, - expected_output_columns=expected_output_columns, - metadata_columns=metadata_columns, - ) else: raise ValueError(f"Unsupported file type: {filetype}") @@ -541,23 +352,23 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": # Handle 'input' fields input_data = record.get('input', {}) - if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1: - flat_record[('input', '')] = input_data['_str_value'] - column_tuples.add(('input', '')) - else: + if isinstance(input_data, dict): for k, v in input_data.items(): flat_record[('input', k)] = v column_tuples.add(('input', k)) + else: + flat_record[('input', '')] = input_data + column_tuples.add(('input', '')) # Handle 'expected_output' fields expected_output = record.get('expected_output', {}) - if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1: - flat_record[('expected_output', '')] = expected_output['_str_value'] - column_tuples.add(('expected_output', '')) - else: + if isinstance(expected_output, dict): for k, v in expected_output.items(): flat_record[('expected_output', k)] = v column_tuples.add(('expected_output', k)) + else: + flat_record[('expected_output', '')] = expected_output + column_tuples.add(('expected_output', '')) # Handle any other top-level fields for k, v in record.items(): @@ -580,18 +391,13 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": return df else: - # For non-multiindex, convert _str_value in the nested structures data = [] for record in self._data: new_record = {} input_data = record.get('input', {}) - new_record['input'] = (input_data['_str_value'] - if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1 - else input_data) + new_record['input'] = input_data expected_output = record.get('expected_output', {}) - new_record['expected_output'] = (expected_output['_str_value'] - if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1 - else expected_output) + new_record['expected_output'] = expected_output # Copy other fields for k, v in record.items(): if k not in ['input', 'expected_output']: @@ -676,179 +482,103 @@ def __init__( def run_task( self, _jobs: int = 10, - _timeout: Optional[float] = None, - _retries: int = 0, - _max_delay: float = 60.0, - raise_on_error: bool = False, + raise_errors: bool = False, ) -> None: """Execute the task function on the dataset and store the outputs. Args: _jobs: Number of concurrent jobs to run (between 1-20). Defaults to 10. - timeout: Maximum time in seconds to wait for each task execution. - If None, will wait indefinitely. Defaults to None. - retries: Number of retry attempts for failed tasks. Defaults to 0. - max_delay: Maximum delay in seconds between retries using exponential backoff. - Defaults to 60 seconds. - raise_on_error: If True, raises exceptions from failed tasks. 
If False, stores + raise_errors: If True, raises exceptions from failed tasks. If False, stores errors in the output. Defaults to False. Raises: - ValueError: If _jobs is not between 1 and 20, or if retries is negative. + ValueError: If _jobs is not between 1 and 20 """ if not 1 <= _jobs <= 20: raise ValueError("Number of jobs must be between 1 and 20") - if _retries < 0: - raise ValueError("Number of retries must be non-negative") + self.outputs = [] total_rows = len(self.dataset) completed = 0 + error_count = 0 + error_messages = [] def process_row(idx_row): idx, row = idx_row - attempt = 0 - delay = 1.0 # Initial delay in seconds - - while attempt <= _retries: - start_time = time.time() - try: - # Extract the input data and convert if it's a _str_value dict - input_data = row['input'] - if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1: - input_data = input_data['_str_value'] - - def execute_task(): - if getattr(self.task, '_accepts_config', False): - return self.task(input_data, self.config) - return self.task(input_data) - - # Use ThreadPoolExecutor to enforce timeout - with concurrent.futures.ThreadPoolExecutor(max_workers=1) as single_executor: - future = single_executor.submit(execute_task) - output = future.result(timeout=_timeout) - - # Ensure output is a dictionary with _str_value for strings - if isinstance(output, str): - output = {'_str_value': output} - elif not isinstance(output, dict): - output = {'value': output} - - # Prepare output data - output_data = { - "idx": idx, - "output": output, - "metadata": { - "timestamp": start_time, - "duration": time.time() - start_time, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "error": { - "message": None, - "stack": None, - "type": None, - } + start_time = time.time() + try: + input_data = row['input'] + + if getattr(self.task, '_accepts_config', False): + output = self.task(input_data, self.config) + else: + output = self.task(input_data) + + output_data = { + "idx": idx, + "output": output, + "metadata": { + "timestamp": start_time, + "duration": time.time() - start_time, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "error": { + "message": None, + "stack": None, + "type": None, } - return output_data - - except concurrent.futures.TimeoutError as e: - print(f"Timeout error: {e}") - if raise_on_error: - # Raise specific experiment task error - raise ExperimentTaskError(f"Task timed out after {_timeout} seconds", idx, e) - if attempt < _retries: - # Exponential backoff and retry - sleep_time = min(delay, _max_delay) - time.sleep(sleep_time) - delay *= 2 - attempt += 1 - else: - # All retries exhausted, record the timeout error - output_data = { - "idx": idx, - "output": None, - "metadata": { - "timestamp": start_time, - "duration": time.time() - start_time, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "error": { - "message": f"Task timed out after {_timeout} seconds", - "stack": None, - "type": "TimeoutError", - } - } - return output_data - - except Exception as e: - print(f"Error: {e}") - if raise_on_error: - # Raise specific experiment task error - raise ExperimentTaskError(str(e), idx, e) - if attempt < _retries: - # Exponential backoff and retry - sleep_time = min(delay, _max_delay) - time.sleep(sleep_time) 
- delay *= 2 - attempt += 1 - else: - # All retries exhausted, record the error - output_data = { - "idx": idx, - "output": None, - "metadata": { - "timestamp": start_time, - "duration": time.time() - start_time, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "error": { - "message": str(e), - "stack": None, - "type": type(e).__name__, - } - } - return output_data + } + return output_data - # Initialize the progress bar - _print_progress_bar(0, total_rows, prefix='Processing:', suffix='Complete') + except Exception as e: + error_message = str(e) + error_messages.append(f"Row {idx}: {error_message}") + return { + "idx": idx, + "output": None, + "metadata": { + "timestamp": start_time, + "duration": time.time() - start_time, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "error": { + "message": error_message, + "stack": None, + "type": type(e).__name__, + } + } - # Use a flag to determine if an error occurred - error_occurred = False - error_exception = None + _print_progress_bar(0, total_rows, prefix='Processing:', suffix='Complete') with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: - # Submit the process_row function to the executor for each dataset record futures = {executor.submit(process_row, (idx, row)): idx for idx, row in enumerate(self.dataset)} - outputs_buffer = [None] * total_rows + try: for future in concurrent.futures.as_completed(futures): idx = futures[future] - start_time = time.time() try: output_data = future.result() outputs_buffer[idx] = output_data - if raise_on_error and output_data['error']['message']: - # An error occurred; cancel all futures - error_occurred = True - error_exception = Exception(f"Task failed on row {idx}: {output_data['error']['message']}") - break + if raise_errors and output_data['error']['message']: + error_message = output_data['error']['message'] + raise ExperimentTaskError(error_message, idx, output_data['error']['type']) + elif output_data['error']['message']: + error_count += 1 + except Exception as e: - print(f"Error: {e}") outputs_buffer[idx] = { "idx": idx, "output": None, "metadata": { - "timestamp": start_time, - "duration": time.time() - start_time, + "timestamp": time.time(), + "duration": 0, "dataset_record_idx": idx, "project_name": self.project_name, "experiment_name": self.name, @@ -860,38 +590,39 @@ def execute_task(): "type": type(e).__name__, } } - if raise_on_error: - # An exception occurred; cancel all futures - error_occurred = True - error_exception = e - break + if raise_errors: + raise e + else: + error_count += 1 + error_messages.append(f"Row {idx}: {str(e)}") + completed += 1 _print_progress_bar(completed, total_rows, prefix='Processing:', suffix='Complete') - finally: - if error_occurred: - print(f"Error occurred: {error_exception}") - # Cancel all pending futures - for future in futures: - future.cancel() - # Shutdown the executor immediately - executor.shutdown(wait=False) - raise error_exception + + except Exception as e: + for future in futures: + future.cancel() + executor.shutdown(wait=False) + raise e self.outputs = outputs_buffer self.has_run = True - # Log error statistics if any errors occurred - error_count = sum(1 for output in self.outputs if output['error']['message'] is not None) + error_rate = (error_count / total_rows) * 100 + print(f"\nTask completed with {error_count} errors 
({error_rate:.2f}% error rate)") + if error_count > 0: - error_rate = (error_count / total_rows) * 100 - print(f"Task completed with {error_count} errors ({error_rate:.2f}% error rate)") + print("\nError Summary:") + for error_msg in error_messages: + print(f"- {error_msg}") + print("\nIf you'd like to halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.\n") - def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_on_error: bool = False) -> "ExperimentResults": + def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_errors: bool = False) -> "ExperimentResults": """Run evaluators on the outputs and return ExperimentResults. Args: evaluators (Optional[List[Callable]]): List of evaluators to use. If None, uses the experiment's evaluators. - raise_on_error (bool): If True, raises exceptions encountered during evaluation. + raise_errors (bool): If True, raises exceptions encountered during evaluation. Returns: ExperimentResults: A new ExperimentResults instance with the evaluation results. @@ -913,40 +644,31 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_on_ evaluations = [] total_rows = len(self.outputs) completed = 0 + error_count = 0 + error_messages = [] _print_progress_bar(0, total_rows, prefix='Evaluating:', suffix='Complete') for idx, output_data in enumerate(self.outputs): try: output = output_data["output"] - # Convert output if it has '_str_value' - if isinstance(output, dict) and '_str_value' in output and len(output) == 1: - output = output['_str_value'] - - # Get the corresponding dataset row + dataset_row = self.dataset[idx] input_data = dataset_row.get('input', {}) expected_output = dataset_row.get('expected_output', {}) - - # Convert input_data if it has '_str_value' - if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1: - input_data = input_data['_str_value'] - # Convert expected_output if it has '_str_value' - if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1: - expected_output = expected_output['_str_value'] - - # Perform evaluation evaluations_dict = {} for evaluator in evaluators_to_use: try: evaluation_result = evaluator(expected_output, output, input_data) evaluations_dict[evaluator.__name__] = evaluation_result except Exception as e: - print(f"Error evaluating row {idx}: {type(e).__name__}: {e}, with evaluator {evaluator.__name__}") - raise e + error_count += 1 + error_message = f"Row {idx}, Evaluator {evaluator.__name__}: {type(e).__name__}: {e}" + error_messages.append(error_message) + if raise_errors: + raise e - # Store evaluation results evaluations.append({ "idx": idx, "evaluations": evaluations_dict, @@ -954,8 +676,11 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_on_ }) except Exception as e: - if raise_on_error: + if raise_errors: raise e + error_count += 1 + error_message = f"Row {idx}: {type(e).__name__}: {e}" + error_messages.append(error_message) evaluations.append({ "idx": idx, "evaluations": {}, @@ -969,33 +694,38 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_on_ completed += 1 _print_progress_bar(completed, total_rows, prefix='Evaluating:', suffix='Complete') - # Return new ExperimentResults without modifying the experiment's state + error_rate = (error_count / (total_rows * len(evaluators_to_use))) * 100 + print(f"\nEvaluation completed with {error_count} errors 
({error_rate:.2f}% error rate)") + + if error_count > 0: + + print("\nError Summary:") + for error_msg in error_messages: + print(f"- {error_msg}") + print("\nIf you'd like to halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.\n") + + self.has_evaluated = True return ExperimentResults(self.dataset, self, self.outputs, evaluations) def run( self, _jobs: int = 10, - timeout: Optional[float] = None, - retries: int = 0, - max_delay: float = 60.0, - raise_on_error: bool = False, + raise_errors: bool = False, ) -> "ExperimentResults": """Execute the task and evaluations, returning the results. Args: _jobs (int): Number of worker threads. timeout (float, optional): Time limit for the task execution in seconds. - retries (int): Number of retries for failed tasks. - max_delay (float): Maximum delay between retries in seconds. - raise_on_error (bool): If True, raises exceptions from failed tasks. If False, stores - errors in the output. Defaults to False. + raise_errors (bool): If True, raises exceptions from failed tasks. If False, stores + errors in the output. Defaults to False. Returns: ExperimentResults: The results of the experiment. """ - self.run_task(_jobs=_jobs, _timeout=timeout, _retries=retries, _max_delay=max_delay, raise_on_error=raise_on_error) - experiment_results = self.run_evaluations(raise_on_error=raise_on_error) - print() # Move to the next line after completion + self.run_task(_jobs=_jobs, raise_errors=raise_errors) + experiment_results = self.run_evaluations(raise_errors=raise_errors) + print() return experiment_results @@ -1047,31 +777,15 @@ def __len__(self) -> int: return len(self.merged_results) def __getitem__(self, index: int) -> Any: - """Get a result record, converting _str_value dictionaries back to strings. + """Get a result record. Args: index: Index of the record to retrieve Returns: - Dict containing the record with any _str_value values converted to strings + Dict containing the record. 
""" result = self.merged_results[index].copy() - - # Convert input if it has _str_value - if 'input' in result and isinstance(result['input'], dict): - if '_str_value' in result['input'] and len(result['input']) == 1: - result['input'] = result['input']['_str_value'] - - # Convert expected_output if it has _str_value - if 'expected_output' in result and isinstance(result['expected_output'], dict): - if '_str_value' in result['expected_output'] and len(result['expected_output']) == 1: - result['expected_output'] = result['expected_output']['_str_value'] - - # Convert output if it has _str_value - if 'output' in result and isinstance(result['output'], dict): - if '_str_value' in result['output'] and len(result['output']) == 1: - result['output'] = result['output']['_str_value'] - return result def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": @@ -1105,36 +819,29 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": record = {} if multiindex: - # Handle 'input' fields input_data = result.get('input', {}) - if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1: - record[('input', '')] = input_data['_str_value'] - column_tuples.add(('input', '')) - else: + if isinstance(input_data, dict): for k, v in input_data.items(): record[('input', k)] = v column_tuples.add(('input', k)) + else: + record[('input', '')] = input_data + column_tuples.add(('input', '')) - # Handle 'expected_output' fields expected_output = result.get('expected_output', {}) - if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1: - record[('expected_output', '')] = expected_output['_str_value'] - column_tuples.add(('expected_output', '')) - else: + if isinstance(expected_output, dict): for k, v in expected_output.items(): record[('expected_output', k)] = v column_tuples.add(('expected_output', k)) + else: + record[('expected_output', '')] = expected_output + column_tuples.add(('expected_output', '')) - # Handle 'output' fields output = result.get('output', {}) if isinstance(output, dict): - if '_str_value' in output and len(output) == 1: - record[('output', '')] = output['_str_value'] - column_tuples.add(('output', '')) - else: - for k, v in output.items(): - record[('output', k)] = v - column_tuples.add(('output', k)) + for k, v in output.items(): + record[('output', k)] = v + column_tuples.add(('output', k)) else: record[('output', '')] = output column_tuples.add(('output', '')) @@ -1173,17 +880,11 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": # Non-multiindex implementation remains the same new_record = {} input_data = result.get('input', {}) - new_record['input'] = (input_data['_str_value'] - if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1 - else input_data) + new_record['input'] = input_data expected_output = result.get('expected_output', {}) - new_record['expected_output'] = (expected_output['_str_value'] - if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1 - else expected_output) + new_record['expected_output'] = expected_output output = result.get('output', {}) - new_record['output'] = (output['_str_value'] - if isinstance(output, dict) and '_str_value' in output and len(output) == 1 - else output) + new_record['output'] = output new_record['evaluations'] = result.get('evaluations', {}) new_record['metadata'] = result.get('metadata', {}) new_record['config'] = self.experiment.config @@ 
-1209,7 +910,7 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": cols = [col for col in COLUMN_ORDER if col in df.columns] return df[cols] - def push(self, overwrite: bool = False) -> None: + def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite """Push the experiment results to Datadog. Raises: @@ -1248,61 +949,30 @@ def push(self, overwrite: bool = False) -> None: else: project_id = projects[0]["id"] - # Check if experiment exists - encoded_name = quote(self.experiment.name) - url = f"/api/unstable/llm-obs/v1/experiments?filter[name]={encoded_name}" - resp = exp_http_request("GET", url) - response_data = resp.json() - experiments = response_data.get("data", []) - - if not experiments: - # Create new experiment - experiment_payload = { - "data": { - "type": "experiments", - "attributes": { - "name": self.experiment.name, - "description": self.experiment.description, - "dataset_id": self.experiment.dataset._datadog_dataset_id, - "project_id": project_id, - "metadata": { - "tags": self.experiment.tags, - **self.experiment.metadata, - "config": self.experiment.config, - }, - }, - } - } - resp = exp_http_request( - "POST", "/api/unstable/llm-obs/v1/experiments", body=json.dumps(experiment_payload).encode("utf-8") - ) - response_data = resp.json() - experiment_id = response_data["data"]["id"] - else: - # Experiment exists, create a new version - version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - new_experiment_name = f"{self.experiment.name}-{version_suffix}" - experiment_payload = { - "data": { - "type": "experiments", - "attributes": { - "name": new_experiment_name, - "description": self.experiment.description, - "dataset_id": self.experiment.dataset._datadog_dataset_id, - "project_id": project_id, - "metadata": { - **self.experiment.metadata, - "config": self.experiment.config, - }, + # Create new experiment + experiment_payload = { + "data": { + "type": "experiments", + "attributes": { + "name": self.experiment.name, + "description": self.experiment.description, + "dataset_id": self.experiment.dataset._datadog_dataset_id, + "project_id": project_id, + "metadata": { + "tags": self.experiment.tags, + **(self.experiment.metadata or {}), + "config": self.experiment.config, }, - } + "ensure_unique": True, # Generates a new experiment with a unique name if the experiment name already exists + }, } - resp = exp_http_request( - "POST", "/api/unstable/llm-obs/v1/experiments", body=json.dumps(experiment_payload).encode("utf-8") - ) - response_data = resp.json() - experiment_id = response_data["data"]["id"] - self.experiment.name = new_experiment_name + } + resp = exp_http_request( + "POST", "/api/unstable/llm-obs/v1/experiments", body=json.dumps(experiment_payload).encode("utf-8") + ) + response_data = resp.json() + experiment_id = response_data["data"]["id"] + self.experiment.name = response_data["data"]["attributes"]["name"] spans = [] metrics = [] @@ -1431,7 +1101,10 @@ def exp_http_request(method: str, url: str, body: Optional[bytes] = None) -> HTT full_url = BASE_URL + url resp = http_request(method, full_url, headers=headers, body=body) if resp.status_code == 403: - raise ValueError("API key or application key is incorrect.") + if DD_SITE != "datadoghq.com": + raise ValueError("DD_SITE may be incorrect. 
Please check your DD_SITE environment variable.") + else: + raise ValueError("API key or application key is incorrect.") if resp.status_code >= 400: try: error_details = resp.json() @@ -1470,7 +1143,7 @@ def wrapper(expected_output: Union[str, Dict[str, Any]], output: Union[str, Dict # Enforce signature compliance sig = inspect.signature(func) params = sig.parameters - required_params = ['expected_output', 'output', 'input'] + required_params = ['input', 'output', 'expected_output'] if not all(param in params for param in required_params): raise TypeError(f"Evaluator function must have parameters {required_params}.") wrapper._is_evaluator = True # Set attribute to indicate decoration @@ -1481,117 +1154,10 @@ def _print_progress_bar(iteration, total, prefix='', suffix='', decimals=1, leng percent = f"{100 * (iteration / float(total)):.{decimals}f}" filled_length = int(length * iteration // total) bar = fill * filled_length + '-' * (length - filled_length) - print(f'\r{prefix} |{bar}| {percent}% {suffix}', end='\r') + # Use carriage return '\r' to overwrite the line + print(f'\r{prefix} |{bar}| {percent}% {suffix}', end='\r', flush=True) if iteration == total: - print() - - -class ExperimentGrid: - """Class to run a grid of experiments over multiple parameter combinations. - - Attributes: - name (str): Name of the experiment grid. - task (Callable): The task function to execute. - dataset (Dataset): The dataset to use. - evaluators (List[Callable]): List of evaluator functions. - config (Dict[str, List[Any]]): Parameter grid to run over. - tags (List[str]): List of tags. - project_name (str): Name of the project. - description (str): Description of the experiment grid. - metadata (Dict[str, Any]): Metadata dictionary. - experiments (List[Experiment]): List of experiments created. - results (List[ExperimentResults]): List of corresponding results. 
- """ - - def __init__( - self, - name: str, - task: Callable, - dataset: Dataset, - evaluators: List[Callable], - config: Dict[str, List[Any]], - tags: List[str] = [], - project_name: str = "-", - description: str = "", - metadata: Dict[str, Any] = {}, - ) -> None: - self.name = name - self.task = task - self.dataset = dataset - self.evaluators = evaluators - self.config = config - self.tags = tags - self.project_name = project_name - self.description = description - self.metadata = metadata - self.experiments = [] - self.results = [] - - # Generate all parameter combinations and create experiments - self._generate_experiments() - - def _generate_experiments(self): - keys, values = zip(*self.config.items()) - param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)] - - for params in param_combinations: - # Create config for the experiment - config = params.copy() - - # Compute hash of the config - config_str = json.dumps(config, sort_keys=True) - config_hash = hashlib.md5(config_str.encode('utf-8')).hexdigest() - config_hash_tag = f"config_hash:{config_hash}" - - # Generate a unique name for each experiment - experiment_name = f"{self.name}_" + "_".join(f"{k}_{v}" for k, v in params.items()) - - # Create tags for parameters - param_tags = [f"{k}:{v}" for k, v in params.items()] + [config_hash_tag] - - # Create a new experiment instance with updated config and name - experiment = Experiment( - name=experiment_name, - task=self.task, - dataset=self.dataset, - evaluators=self.evaluators, - tags=self.tags + param_tags, - project_name=self.project_name, - description=self.description, - metadata={**self.metadata, "config": config}, - config=config, - ) - - # Add the experiment to the list without running it - self.experiments.append(experiment) - - def __len__(self): - return len(self.experiments) - - def __getitem__(self, index): - return self.experiments[index] - - # Update the run method to use the pre-generated experiments - def run(self, _jobs: int = 10): - """Run experiments for all combinations of parameters in the grid. - - Args: - _jobs (int): Number of parallel workers for each experiment run. - """ - for experiment in self.experiments: - results = experiment.run(_jobs=_jobs) - self.results.append(results) - - return self.results - - def get_all_results(self) -> List[ExperimentResults]: - """Return all results from the experiment grid. - - Returns: - List[ExperimentResults]: A list of results for each experiment. 
- """ - return self.results - + print() # Move to the next line after completion class DatasetFileError(Exception): """Exception raised when there are errors reading or processing dataset files.""" @@ -1603,4 +1169,5 @@ class ExperimentTaskError(Exception): def __init__(self, message: str, row_idx: int, original_error: Exception = None): self.row_idx = row_idx self.original_error = original_error - super().__init__(f"Task failed on row {row_idx}: {message}") + super().__init__(message) + From b29fa1def19eef78b38282ffe73154bab11dcca6 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Mon, 9 Dec 2024 15:47:59 -0500 Subject: [PATCH 26/36] modifications to types --- ddtrace/llmobs/__init__.py | 4 +--- ddtrace/llmobs/_experiments.py | 42 +++++++++++++++++----------------- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/ddtrace/llmobs/__init__.py b/ddtrace/llmobs/__init__.py index 549f83ad88f..f2e91e8a1ca 100644 --- a/ddtrace/llmobs/__init__.py +++ b/ddtrace/llmobs/__init__.py @@ -8,11 +8,9 @@ from ._experiments import Dataset from ._experiments import Experiment -from ._experiments import ExperimentResults -from ._experiments import FileType from ._experiments import task from ._experiments import evaluator from ._llmobs import LLMObs -__all__ = ["LLMObs", "Dataset", "Experiment", "ExperimentResults", "FileType", "task", "evaluator"] +__all__ = ["LLMObs", "Dataset", "Experiment", "task", "evaluator"] diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index b53f1c11708..0af191bb9bb 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,7 +1,3 @@ -# TODO: Test failures on eval, how do we set errors, Report null when evaluator fails -# TODO: Test workflows for re-evals and publishing results -# TODO: Test pushing experiments without data - """ Test coverage ideas: - Define task and evaluator wrong @@ -10,20 +6,18 @@ - Eval failures """ +import csv import concurrent.futures -from datetime import datetime +from enum import Enum +from functools import wraps +import hashlib +import inspect import json import os import time from typing import Any, Callable, Dict, Iterator, List, Optional, Union -import inspect -from functools import wraps from urllib.parse import quote import uuid -import csv -from enum import Enum -import itertools -import hashlib from ._utils import HTTPResponse from ._utils import http_request @@ -31,8 +25,10 @@ import ddtrace DD_SITE = os.getenv("DD_SITE", "datadoghq.com") -BASE_URL = f"https://api.{DD_SITE}" #TODO: Change to https://api.{DD_SITE} when testing is complete in staging - +if DD_SITE == "datadoghq.com": + BASE_URL = f"https://api.{DD_SITE}" +else: + BASE_URL = f"https://{DD_SITE}" class FileType(Enum): CSV = 'csv' @@ -660,7 +656,7 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err evaluations_dict = {} for evaluator in evaluators_to_use: try: - evaluation_result = evaluator(expected_output, output, input_data) + evaluation_result = evaluator(input_data, output, expected_output) evaluations_dict[evaluator.__name__] = evaluation_result except Exception as e: error_count += 1 @@ -694,7 +690,11 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err completed += 1 _print_progress_bar(completed, total_rows, prefix='Evaluating:', suffix='Complete') - error_rate = (error_count / (total_rows * len(evaluators_to_use))) * 100 + if len(evaluators_to_use) > 0: + error_rate = (error_count / (total_rows * len(evaluators_to_use))) * 100 + else: + 
error_rate = 0 + print(f"\nEvaluation completed with {error_count} errors ({error_rate:.2f}% error rate)") if error_count > 0: @@ -1101,10 +1101,11 @@ def exp_http_request(method: str, url: str, body: Optional[bytes] = None) -> HTT full_url = BASE_URL + url resp = http_request(method, full_url, headers=headers, body=body) if resp.status_code == 403: - if DD_SITE != "datadoghq.com": + if not DD_SITE: raise ValueError("DD_SITE may be incorrect. Please check your DD_SITE environment variable.") else: - raise ValueError("API key or application key is incorrect.") + print(resp.text()) + raise ValueError("DD_API_KEY or DD_APPLICATION_KEY is incorrect.") if resp.status_code >= 400: try: error_details = resp.json() @@ -1138,8 +1139,8 @@ def wrapper(input: Dict[str, Union[str, Dict[str, Any]]], config: Optional[Dict[ def evaluator(func): @wraps(func) - def wrapper(expected_output: Union[str, Dict[str, Any]], output: Union[str, Dict[str, Any]], input: Union[str, Dict[str, Any]] = None) -> Any: - return func(expected_output, output, input) + def wrapper(input: Union[str, Dict[str, Any]] = None, output: Union[str, Dict[str, Any]] = None, expected_output: Union[str, Dict[str, Any]] = None) -> Any: + return func(input, output, expected_output) # Enforce signature compliance sig = inspect.signature(func) params = sig.parameters @@ -1169,5 +1170,4 @@ class ExperimentTaskError(Exception): def __init__(self, message: str, row_idx: int, original_error: Exception = None): self.row_idx = row_idx self.original_error = original_error - super().__init__(message) - + super().__init__(message) \ No newline at end of file From 738cc07dd953a360f8086c485f7a859727e335f8 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Mon, 9 Dec 2024 15:49:13 -0500 Subject: [PATCH 27/36] remove unnecessary comments --- ddtrace/llmobs/_experiments.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 0af191bb9bb..311dbf9dd2e 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,11 +1,3 @@ -""" -Test coverage ideas: -- Define task and evaluator wrong -- Define experiment wrong -- Experiments with failures -- Eval failures -""" - import csv import concurrent.futures from enum import Enum From 105917276cd706f2eb86f608cdabca19ce8e869e Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Mon, 9 Dec 2024 15:52:25 -0500 Subject: [PATCH 28/36] fix code quality violations --- ddtrace/llmobs/_experiments.py | 41 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 311dbf9dd2e..323a20b3bf4 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -308,8 +308,8 @@ def load(cls, path: str, filetype: FileType, name: str, description: str = "", i expected_output_columns=expected_output_columns, metadata_columns=metadata_columns, ) - else: - raise ValueError(f"Unsupported file type: {filetype}") + + raise ValueError(f"Unsupported file type: {filetype}") def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": """Convert the dataset to a pandas DataFrame. 
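The evaluator wrapper above now forwards arguments as `(input, output, expected_output)` and enforces exactly those parameter names, while tasks receive `input` (plus `config` when they declare it). A small sketch of a compliant task/evaluator pair (function names and logic are illustrative only):

    from ddtrace.llmobs import task, evaluator

    @task
    def summarize(input):
        # Toy stand-in for a model call; real tasks can also accept a `config` argument.
        return str(input)[:20]

    @evaluator
    def length_under_limit(input, output, expected_output):
        # Parameter names must match the enforced (input, output, expected_output) signature.
        return len(output) <= 20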
@@ -378,20 +378,19 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": return df - else: - data = [] - for record in self._data: - new_record = {} - input_data = record.get('input', {}) - new_record['input'] = input_data - expected_output = record.get('expected_output', {}) - new_record['expected_output'] = expected_output - # Copy other fields - for k, v in record.items(): - if k not in ['input', 'expected_output']: - new_record[k] = v - data.append(new_record) - return pd.DataFrame(data) + data = [] + for record in self._data: + new_record = {} + input_data = record.get('input', {}) + new_record['input'] = input_data + expected_output = record.get('expected_output', {}) + new_record['expected_output'] = expected_output + # Copy other fields + for k, v in record.items(): + if k not in ['input', 'expected_output']: + new_record[k] = v + data.append(new_record) + return pd.DataFrame(data) def export_to_jsonl(self, file_path): """ @@ -896,11 +895,11 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": df = pd.DataFrame(records_list, columns=pd.MultiIndex.from_tuples(column_tuples)) return df - else: - df = pd.DataFrame(data_rows) - # Reorder columns according to COLUMN_ORDER - cols = [col for col in COLUMN_ORDER if col in df.columns] - return df[cols] + + df = pd.DataFrame(data_rows) + # Reorder columns according to COLUMN_ORDER + cols = [col for col in COLUMN_ORDER if col in df.columns] + return df[cols] def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite """Push the experiment results to Datadog. From 965bdcbdc9e7c2e19524663af14122a7aa8143aa Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Fri, 13 Dec 2024 11:09:58 -0500 Subject: [PATCH 29/36] add test comments --- ddtrace/llmobs/_experiments.py | 216 +++++++++++------------- tests/llmobs/test_llmobs_experiments.py | 9 + 2 files changed, 103 insertions(+), 122 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 323a20b3bf4..c576827b602 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -193,7 +193,7 @@ def push(self) -> None: data = resp.json() # Print url to the dataset in Datadog - print(f"Dataset '{self.name}' created: {BASE_URL}/llm/experiments/datasets/{dataset_id}") + print(f"Dataset '{self.name}' created: {BASE_URL}/llm/experiments/datasets/{dataset_id}\n\n") @classmethod def from_csv( @@ -488,7 +488,6 @@ def run_task( total_rows = len(self.dataset) completed = 0 error_count = 0 - error_messages = [] def process_row(idx_row): idx, row = idx_row @@ -522,7 +521,6 @@ def process_row(idx_row): except Exception as e: error_message = str(e) - error_messages.append(f"Row {idx}: {error_message}") return { "idx": idx, "output": None, @@ -581,7 +579,6 @@ def process_row(idx_row): raise e else: error_count += 1 - error_messages.append(f"Row {idx}: {str(e)}") completed += 1 _print_progress_bar(completed, total_rows, prefix='Processing:', suffix='Complete') @@ -596,13 +593,9 @@ def process_row(idx_row): self.has_run = True error_rate = (error_count / total_rows) * 100 - print(f"\nTask completed with {error_count} errors ({error_rate:.2f}% error rate)") - + print(f"Task completed with {error_count} errors ({error_rate:.2f}% error rate)") if error_count > 0: - print("\nError Summary:") - for error_msg in error_messages: - print(f"- {error_msg}") - print("\nIf you'd like to halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.\n") + print("If you'd like to 
halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.") def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_errors: bool = False) -> "ExperimentResults": """Run evaluators on the outputs and return ExperimentResults. @@ -632,7 +625,6 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err total_rows = len(self.outputs) completed = 0 error_count = 0 - error_messages = [] _print_progress_bar(0, total_rows, prefix='Evaluating:', suffix='Complete') @@ -651,8 +643,6 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err evaluations_dict[evaluator.__name__] = evaluation_result except Exception as e: error_count += 1 - error_message = f"Row {idx}, Evaluator {evaluator.__name__}: {type(e).__name__}: {e}" - error_messages.append(error_message) if raise_errors: raise e @@ -666,8 +656,7 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err if raise_errors: raise e error_count += 1 - error_message = f"Row {idx}: {type(e).__name__}: {e}" - error_messages.append(error_message) + evaluations.append({ "idx": idx, "evaluations": {}, @@ -686,14 +675,10 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err else: error_rate = 0 - print(f"\nEvaluation completed with {error_count} errors ({error_rate:.2f}% error rate)") + print(f"Evaluation completed with {error_count} errors ({error_rate:.2f}% error rate)") if error_count > 0: - - print("\nError Summary:") - for error_msg in error_messages: - print(f"- {error_msg}") - print("\nIf you'd like to halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.\n") + print("If you'd like to halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.") self.has_evaluated = True return ExperimentResults(self.dataset, self, self.outputs, evaluations) @@ -716,7 +701,6 @@ def run( """ self.run_task(_jobs=_jobs, raise_errors=raise_errors) experiment_results = self.run_evaluations(raise_errors=raise_errors) - print() return experiment_results @@ -803,103 +787,86 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": # Define the desired column order COLUMN_ORDER = ['input', 'expected_output', 'output', 'evaluations', 'metadata', 'config', 'error'] - data_rows = [] - column_tuples = set() - - for result in self.merged_results: - record = {} - - if multiindex: - input_data = result.get('input', {}) - if isinstance(input_data, dict): - for k, v in input_data.items(): - record[('input', k)] = v - column_tuples.add(('input', k)) - else: - record[('input', '')] = input_data - column_tuples.add(('input', '')) - - expected_output = result.get('expected_output', {}) - if isinstance(expected_output, dict): - for k, v in expected_output.items(): - record[('expected_output', k)] = v - column_tuples.add(('expected_output', k)) - else: - record[('expected_output', '')] = expected_output - column_tuples.add(('expected_output', '')) - - output = result.get('output', {}) - if isinstance(output, dict): - for k, v in output.items(): - record[('output', k)] = v - column_tuples.add(('output', k)) - else: - record[('output', '')] = output - column_tuples.add(('output', '')) - - # Handle 'evaluations' fields - evaluations = result.get('evaluations', {}) - for eval_name, eval_result in evaluations.items(): - if isinstance(eval_result, dict): - for k, v in eval_result.items(): - record[('evaluations', 
eval_name, k)] = v - column_tuples.add(('evaluations', eval_name, k)) - else: - record[('evaluations', eval_name)] = eval_result - column_tuples.add(('evaluations', eval_name)) - - # Handle 'metadata' fields - for k, v in result.get('metadata', {}).items(): - record[('metadata', k)] = v - column_tuples.add(('metadata', k)) - - # Handle 'config' fields - if self.experiment.config: - for k, v in self.experiment.config.items(): - record[('config', k)] = v - column_tuples.add(('config', k)) - - # Handle 'error' fields - error = result.get('error', {}) - if error: - for k, v in error.items(): - record[('error', k)] = v - column_tuples.add(('error', k)) - - data_rows.append(record) - else: - # Non-multiindex implementation remains the same - new_record = {} - input_data = result.get('input', {}) - new_record['input'] = input_data - expected_output = result.get('expected_output', {}) - new_record['expected_output'] = expected_output - output = result.get('output', {}) - new_record['output'] = output - new_record['evaluations'] = result.get('evaluations', {}) - new_record['metadata'] = result.get('metadata', {}) - new_record['config'] = self.experiment.config - new_record['error'] = result.get('error', {}) - data_rows.append(new_record) - - if multiindex: - # Sort column_tuples based on the desired order - column_tuples = sorted(list(column_tuples), - key=lambda x: (COLUMN_ORDER.index(x[0]), x[1:] if len(x) > 1 else '')) - - # Build the DataFrame - records_list = [] - for record in data_rows: - row = [record.get(col, None) for col in column_tuples] - records_list.append(row) - - df = pd.DataFrame(records_list, columns=pd.MultiIndex.from_tuples(column_tuples)) - return df - - df = pd.DataFrame(data_rows) - # Reorder columns according to COLUMN_ORDER - cols = [col for col in COLUMN_ORDER if col in df.columns] - return df[cols] + # Convert merged_results to DataFrame directly + df = pd.DataFrame(self.merged_results) + + if not multiindex: + # Reorder columns according to COLUMN_ORDER + cols = [col for col in COLUMN_ORDER if col in df.columns] + return df[cols] + + # For multiindex, we need to handle each column type differently + result_dfs = [] + + # Handle input column + input_df = pd.DataFrame({'input': df['input'].values}) + + # Handle expected_output column + expected_output_df = pd.DataFrame({'expected_output': df['expected_output'].values}) + + # Handle output column - expand the nested structure + output_df = pd.json_normalize( + df['output'].fillna({}).values, + sep='_' + ).add_prefix('output_') + + # Handle evaluations - flatten the dictionary + evaluations_df = pd.DataFrame(df['evaluations'].values.tolist()) + if not evaluations_df.empty: + evaluations_df = evaluations_df.astype(object) # Ensure columns are of object type + evaluations_df = evaluations_df.add_prefix('evaluations_') + # Replace NaN with None + evaluations_df = evaluations_df.where(pd.notna(evaluations_df), None) + + # Handle metadata - flatten the dictionary + metadata_df = pd.DataFrame(df['metadata'].values.tolist()) + if not metadata_df.empty: + metadata_df = metadata_df.add_prefix('metadata_') + + # Handle config if it exists + if 'config' in df.columns: + config_df = pd.json_normalize( + df['config'].fillna({}).values, + sep='_' + ).add_prefix('config_') + else: + config_df = pd.DataFrame() + + # Handle error column - flatten the dictionary and preserve None values + error_dicts = df['error'].values.tolist() + error_df = pd.DataFrame(error_dicts) + if not error_df.empty: + error_df = error_df.add_prefix('error_') + + # 
Combine all DataFrames + result_dfs = [ + input_df, + expected_output_df, + output_df, + evaluations_df, + metadata_df, + config_df, + error_df + ] + + # Filter out empty DataFrames and concatenate + result_dfs = [df for df in result_dfs if not df.empty] + final_df = pd.concat(result_dfs, axis=1) + + # Replace NaN with None + final_df = final_df.where(pd.notna(final_df), None) + + # Create MultiIndex columns + new_columns = pd.MultiIndex.from_tuples([ + tuple(col.split('_', 1)) if '_' in col else (col, '') + for col in final_df.columns + ]) + final_df.columns = new_columns + + # Replace NaN with None for the entire DataFrame + final_df = final_df.where(pd.notna(final_df), None) + + return final_df def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite """Push the experiment results to Datadog. @@ -1007,15 +974,21 @@ def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite # Add evaluation metrics for metric_name, metric_value in evaluations.items(): + # Skip None values + if metric_value is None: + print(f"Skipping None value for metric: {metric_name}") + continue + timestamp_ms = int(metadata.get("timestamp", time.time()) * 1000) # Check for bool first, since bool is a subclass of int - if isinstance(metric_value, bool): + if isinstance(metric_value, (bool, str)): metric_type = "categorical" metric_value = str(metric_value).lower() elif isinstance(metric_value, (int, float)): metric_type = "score" else: + print(f"Unknown metric type: {type(metric_value)}") metric_type = "categorical" metric_value = str(metric_value) @@ -1045,7 +1018,7 @@ def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite exp_http_request("POST", url, body=json.dumps(results_payload).encode("utf-8")) # Print URL to the experiment in Datadog - print(f"Experiment '{self.experiment.name}' created: {BASE_URL}/llm/experiments/experiment-list/{experiment_id}") + print(f"Experiment '{self.experiment.name}' created: {BASE_URL}/llm/experiments/experiment-list/{experiment_id} \n\n") def export_to_jsonl(self, file_path): """ @@ -1095,7 +1068,6 @@ def exp_http_request(method: str, url: str, body: Optional[bytes] = None) -> HTT if not DD_SITE: raise ValueError("DD_SITE may be incorrect. 
Please check your DD_SITE environment variable.") else: - print(resp.text()) raise ValueError("DD_API_KEY or DD_APPLICATION_KEY is incorrect.") if resp.status_code >= 400: try: diff --git a/tests/llmobs/test_llmobs_experiments.py b/tests/llmobs/test_llmobs_experiments.py index 7d3612e5bd8..5b4224af8c7 100644 --- a/tests/llmobs/test_llmobs_experiments.py +++ b/tests/llmobs/test_llmobs_experiments.py @@ -1,3 +1,12 @@ +""" +Test coverage ideas: +- Define task and evaluator wrong +- Define experiment wrong +- Experiments with failures +- Eval failures +- Test workflows for re-evals and publishing results +""" + import itertools import os from typing import Any From ba8e8070ac9f9a5fd3f6b5b516e7914e7207dc6a Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Mon, 13 Jan 2025 13:16:09 -0500 Subject: [PATCH 30/36] add error fields on evals --- ddtrace/llmobs/_experiments.py | 208 ++++++++++++++------------------- 1 file changed, 86 insertions(+), 122 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index c576827b602..72128e04fa8 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -7,6 +7,7 @@ import json import os import time +import traceback from typing import Any, Callable, Dict, Iterator, List, Optional, Union from urllib.parse import quote import uuid @@ -14,6 +15,8 @@ from ._utils import HTTPResponse from ._utils import http_request +from decorators import agent + import ddtrace DD_SITE = os.getenv("DD_SITE", "datadoghq.com") @@ -127,6 +130,9 @@ def pull(cls, name: str) -> "Dataset": resp = exp_http_request("GET", url) records_data = resp.json() + if not records_data.get("data", []): + raise ValueError(f"Dataset '{name}' does not contain any records.") + # Transform records into the expected format class_records = [] for record in records_data.get("data", []): @@ -534,7 +540,7 @@ def process_row(idx_row): }, "error": { "message": error_message, - "stack": None, + "stack": traceback.format_exc(), "type": type(e).__name__, } } @@ -571,7 +577,7 @@ def process_row(idx_row): }, "error": { "message": str(e), - "stack": None, + "stack": traceback.format_exc(), "type": type(e).__name__, } } @@ -624,53 +630,49 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err evaluations = [] total_rows = len(self.outputs) completed = 0 - error_count = 0 + error_count = 0 _print_progress_bar(0, total_rows, prefix='Evaluating:', suffix='Complete') for idx, output_data in enumerate(self.outputs): - try: - output = output_data["output"] - - dataset_row = self.dataset[idx] - input_data = dataset_row.get('input', {}) - expected_output = dataset_row.get('expected_output', {}) - - evaluations_dict = {} - for evaluator in evaluators_to_use: - try: - evaluation_result = evaluator(input_data, output, expected_output) - evaluations_dict[evaluator.__name__] = evaluation_result - except Exception as e: - error_count += 1 - if raise_errors: - raise e - - evaluations.append({ - "idx": idx, - "evaluations": evaluations_dict, - "error": None, - }) - - except Exception as e: - if raise_errors: - raise e - error_count += 1 + output = output_data["output"] + dataset_row = self.dataset[idx] + input_data = dataset_row.get('input', {}) + expected_output = dataset_row.get('expected_output', {}) - evaluations.append({ - "idx": idx, - "evaluations": {}, - "error": { - "message": str(e), - "type": type(e).__name__, - "stack": None, - }, - }) + evaluations_dict = {} + + # Run all evaluators for this output + for evaluator in 
evaluators_to_use: + try: + evaluation_result = evaluator(input_data, output, expected_output) + evaluations_dict[evaluator.__name__] = { + "value": evaluation_result, + "error": None + } + except Exception as e: + error_count += 1 + evaluations_dict[evaluator.__name__] = { + "value": None, + "error": { + "message": str(e), + "type": type(e).__name__, + "stack": traceback.format_exc(), + } + } + if raise_errors: + raise e + + # Add single evaluation entry for this output + evaluations.append({ + "idx": idx, + "evaluations": evaluations_dict + }) completed += 1 _print_progress_bar(completed, total_rows, prefix='Evaluating:', suffix='Complete') - if len(evaluators_to_use) > 0: + if len(evaluators_to_use) > 0: error_rate = (error_count / (total_rows * len(evaluators_to_use))) * 100 else: error_rate = 0 @@ -679,7 +681,7 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err if error_count > 0: print("If you'd like to halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.") - + self.has_evaluated = True return ExperimentResults(self.dataset, self, self.outputs, evaluations) @@ -732,15 +734,18 @@ def _merge_results(self) -> List[Dict[str, Any]]: evaluation_data = self.evaluations[idx] dataset_record = self.dataset._data[idx] + # Get base metadata and add tags to it + metadata = output_data.get('metadata', {}) + metadata['tags'] = self.experiment.tags + merged_result = { "idx": idx, "input": dataset_record.get('input', {}), "expected_output": dataset_record.get('expected_output', {}), "output": output_data.get('output'), "evaluations": evaluation_data.get('evaluations', {}), - "metadata": output_data.get('metadata', {}), + "metadata": metadata, "error": output_data.get('error'), - "tags": self.experiment.tags, } merged_results.append(merged_result) return merged_results @@ -764,14 +769,14 @@ def __getitem__(self, index: int) -> Any: return result def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": - """Convert the experiment results to a pandas DataFrame, including the experiment config. + """Convert the experiment results to a pandas DataFrame. Args: - multiindex (bool): If True, expand nested dictionaries into MultiIndex columns. + multiindex (bool): If True, expand input/output/expected_output dictionaries into MultiIndex columns. If False, keep the nested dictionaries as they are. Returns: - pd.DataFrame: A DataFrame representation of the experiment results. + pd.DataFrame: DataFrame representation of the experiment results. Raises: ImportError: If pandas is not installed. 
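After this change each evaluator result is recorded as a `{"value": ..., "error": ...}` pair and the experiment tags are folded into `metadata`, so consumers read per-evaluator values and failures from the merged row. A self-contained sketch of walking one row in that shape (the row literal below is hand-built for illustration):

    # A hand-built row mirroring the merged-result layout produced by _merge_results()
    # after this patch; all values are illustrative.
    row = {
        "idx": 0,
        "input": "What is 2+2?",
        "expected_output": "4",
        "output": "4",
        "evaluations": {
            "exact_match": {"value": True, "error": None},
            "flaky_check": {
                "value": None,
                "error": {"message": "boom", "type": "RuntimeError", "stack": "..."},
            },
        },
        "metadata": {"dataset_record_idx": 0, "tags": ["team:llm-obs"]},
        "error": {"message": None, "stack": None, "type": None},
    }

    for name, outcome in row["evaluations"].items():
        if outcome["error"] is not None:
            # Evaluator failures are captured per evaluator instead of aborting the row.
            print(f"{name} failed: {outcome['error']['type']}: {outcome['error']['message']}")
        else:
            print(f"{name} -> {outcome['value']}")

    print("tags recorded with the run:", row["metadata"]["tags"])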
@@ -784,88 +789,47 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": "Please install it with `pip install pandas`" ) - # Define the desired column order - COLUMN_ORDER = ['input', 'expected_output', 'output', 'evaluations', 'metadata', 'config', 'error'] - # Convert merged_results to DataFrame directly df = pd.DataFrame(self.merged_results) if not multiindex: - # Reorder columns according to COLUMN_ORDER - cols = [col for col in COLUMN_ORDER if col in df.columns] - return df[cols] + return df - # For multiindex, we need to handle each column type differently + # Process input, output, and expected_output with MultiIndex + special_fields = ['input', 'output', 'expected_output'] result_dfs = [] - # Handle input column - input_df = pd.DataFrame({'input': df['input'].values}) - - # Handle expected_output column - expected_output_df = pd.DataFrame({'expected_output': df['expected_output'].values}) - - # Handle output column - expand the nested structure - output_df = pd.json_normalize( - df['output'].fillna({}).values, - sep='_' - ).add_prefix('output_') - - # Handle evaluations - flatten the dictionary - evaluations_df = pd.DataFrame(df['evaluations'].values.tolist()) - if not evaluations_df.empty: - evaluations_df = evaluations_df.astype(object) # Ensure columns are of object type - evaluations_df = evaluations_df.add_prefix('evaluations_') - # Replace NaN with None - evaluations_df = evaluations_df.where(pd.notna(evaluations_df), None) - - # Handle metadata - flatten the dictionary - metadata_df = pd.DataFrame(df['metadata'].values.tolist()) - if not metadata_df.empty: - metadata_df = metadata_df.add_prefix('metadata_') - - # Handle config if it exists - if 'config' in df.columns: - config_df = pd.json_normalize( - df['config'].fillna({}).values, - sep='_' - ).add_prefix('config_') - else: - config_df = pd.DataFrame() + # Handle special fields (input, output, expected_output) + for field in special_fields: + if field not in df.columns: + continue + + # Get the first non-null value to check type + first_value = next((v for v in df[field] if v is not None), None) + + if isinstance(first_value, dict): + # For dictionary values, expand into columns + field_df = pd.json_normalize(df[field].values) + else: + # For simple values, use 'value' as the subcolumn + field_df = pd.DataFrame({'value': df[field].values}) + + # Create MultiIndex columns for this field + field_df.columns = pd.MultiIndex.from_tuples([(field, col) for col in field_df.columns]) + result_dfs.append(field_df) - # Handle error column - flatten the dictionary and preserve None values - error_dicts = df['error'].values.tolist() - error_df = pd.DataFrame(error_dicts) - if not error_df.empty: - error_df = error_df.add_prefix('error_') + # Add all other columns as-is + other_cols = [col for col in df.columns if col not in special_fields] + if other_cols: + other_df = df[other_cols] + result_dfs.append(other_df) # Combine all DataFrames - result_dfs = [ - input_df, - expected_output_df, - output_df, - evaluations_df, - metadata_df, - config_df, - error_df - ] - - # Filter out empty DataFrames and concatenate - result_dfs = [df for df in result_dfs if not df.empty] final_df = pd.concat(result_dfs, axis=1) # Replace NaN with None final_df = final_df.where(pd.notna(final_df), None) - # Create MultiIndex columns - new_columns = pd.MultiIndex.from_tuples([ - tuple(col.split('_', 1)) if '_' in col else (col, '') - for col in final_df.columns - ]) - final_df.columns = new_columns - - # Replace NaN with None for the entire 
DataFrame - final_df = final_df.where(pd.notna(final_df), None) - return final_df def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite @@ -973,31 +937,31 @@ def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite spans.append(span) # Add evaluation metrics - for metric_name, metric_value in evaluations.items(): + for metric_payload_name, metric_payload_value in evaluations.items(): # Skip None values - if metric_value is None: - print(f"Skipping None value for metric: {metric_name}") + if metric_payload_value is None: + print(f"Skipping None value for metric: {metric_payload_name}") continue timestamp_ms = int(metadata.get("timestamp", time.time()) * 1000) # Check for bool first, since bool is a subclass of int - if isinstance(metric_value, (bool, str)): + if isinstance(metric_payload_value["value"], (bool, str)): metric_type = "categorical" - metric_value = str(metric_value).lower() - elif isinstance(metric_value, (int, float)): + metric_value = str(metric_payload_value["value"]).lower() + elif isinstance(metric_payload_value["value"], (int, float)): metric_type = "score" else: - print(f"Unknown metric type: {type(metric_value)}") metric_type = "categorical" - metric_value = str(metric_value) + metric_value = str(metric_payload_value["value"]) metric = { "span_id": span["span_id"], "metric_type": metric_type, "timestamp_ms": timestamp_ms, - "label": metric_name, + "label": metric_payload_name, "score_value" if metric_type == "score" else "categorical_value": metric_value, + "error": metric_payload_value["error"], } metrics.append(metric) From 2a73462d4d880d9fd6d372af3f73d9bf534fec2f Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Wed, 15 Jan 2025 12:48:14 -0500 Subject: [PATCH 31/36] encode llm events in utf-8 --- ddtrace/llmobs/_writer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index 5880019d67f..e1dd9280ff7 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -210,6 +210,8 @@ def encode(self): data = {"_dd.stage": "raw", "_dd.tracer_version": ddtrace.__version__, "event_type": "span", "spans": events} try: enc_llm_events = safe_json(data) + if isinstance(enc_llm_events, str): + enc_llm_events = enc_llm_events.encode('utf-8') logger.debug("encode %d LLMObs span events to be sent", len(events)) except TypeError: logger.error("failed to encode %d LLMObs span events", len(events), exc_info=True) From 9497ea8717823b639ce338eda45075d90119005f Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Wed, 15 Jan 2025 12:51:01 -0500 Subject: [PATCH 32/36] tracing works --- ddtrace/llmobs/__init__.py | 2 +- ddtrace/llmobs/_experiments.py | 264 ++++++++++++++++++++------------- 2 files changed, 166 insertions(+), 100 deletions(-) diff --git a/ddtrace/llmobs/__init__.py b/ddtrace/llmobs/__init__.py index f2e91e8a1ca..26bc6f964b3 100644 --- a/ddtrace/llmobs/__init__.py +++ b/ddtrace/llmobs/__init__.py @@ -5,12 +5,12 @@ from ddtrace.llmobs import LLMObs LLMObs.enable() """ +from ._llmobs import LLMObs from ._experiments import Dataset from ._experiments import Experiment from ._experiments import task from ._experiments import evaluator -from ._llmobs import LLMObs __all__ = ["LLMObs", "Dataset", "Experiment", "task", "evaluator"] diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 72128e04fa8..b9dfc360e64 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -15,7 +15,10 @@ from ._utils import HTTPResponse 
from ._utils import http_request -from decorators import agent +from .decorators import agent +from ._llmobs import LLMObs + +from ddtrace.context import Context import ddtrace @@ -28,6 +31,14 @@ class FileType(Enum): CSV = 'csv' +LLMObs.enable( + ml_app="experiment-jonathan", + integrations_enabled=True, + agentless_enabled=True, + site="datadoghq.com", + api_key=os.getenv("DD_API_KEY"), +) + class Dataset: """A container for LLM experiment data that can be pushed to and retrieved from Datadog. @@ -40,21 +51,29 @@ class Dataset: description (str): Optional description of the dataset """ - def __init__(self, name: str, data: List[Dict[str, Union[str, Dict[str, Any]]]], description: str = "") -> None: + def __init__(self, name: str, data: Optional[List[Dict[str, Union[str, Dict[str, Any]]]]] = None, description: str = "") -> None: """ Args: name: Name of the dataset data: List of dictionaries where 'input' and 'expected_output' values can be - either strings or dictionaries of strings + either strings or dictionaries of strings. If None, attempts to pull from Datadog. description: Optional description of the dataset """ self.name = name self.description = description - self._validate_data(data) - self._data = data - # Post-push attributes - self._datadog_dataset_id = None + # If no data provided, attempt to pull from Datadog + if data is None: + print( + f"No data provided, pulling dataset '{name}' from Datadog..." + ) + pulled_dataset = self.pull(name) + self._data = pulled_dataset._data + self._datadog_dataset_id = pulled_dataset._datadog_dataset_id + else: + self._validate_data(data) + self._data = data + self._datadog_dataset_id = None def __iter__(self) -> Iterator[Dict[str, Union[str, Dict[str, Any]]]]: return iter(self._data) @@ -151,9 +170,12 @@ def pull(cls, name: str) -> "Dataset": dataset._datadog_dataset_id = dataset_id return dataset - def push(self) -> None: + def push(self, chunk_size: int = 300) -> None: """Push the dataset to Datadog. + Args: + chunk_size: Number of records to upload in each chunk. Defaults to 300. + Returns: Dict[str, Any]: Dictionary containing dataset information including: - dataset_id: The ID of the created/updated dataset @@ -192,14 +214,27 @@ def push(self) -> None: "Please use a different name for your dataset." 
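
Before the chunked upload code below, a minimal usage sketch of the constructor and push() changes in this hunk; the dataset name and rows are invented for illustration:

from ddtrace.llmobs import Dataset

rows = [
    {"input": {"question": "What is 2 + 2?"}, "expected_output": {"answer": "4"}},
    {"input": {"question": "Capital of France?"}, "expected_output": {"answer": "Paris"}},
]

ds = Dataset(name="toy-qa", data=rows, description="toy dataset")
ds.push(chunk_size=300)   # records are uploaded in slices of chunk_size

# With no data argument, the constructor now pulls an existing dataset of that name from Datadog.
same_ds = Dataset(name="toy-qa")
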
) - # Add records to the dataset - records_payload = {"data": {"type": "datasets", "attributes": {"records": self._data}}} - url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" - resp = exp_http_request("POST", url, body=json.dumps(records_payload).encode("utf-8")) - data = resp.json() + # Split records into chunks and upload + total_records = len(self._data) + chunks = [self._data[i:i + chunk_size] for i in range(0, total_records, chunk_size)] + total_chunks = len(chunks) + + # Only show progress bar for large datasets + show_progress = total_records > chunk_size + if show_progress: + print(f"\nUploading {total_records} records in {total_chunks} chunks...") + _print_progress_bar(0, total_chunks, prefix='Uploading:', suffix='Complete') + + for i, chunk in enumerate(chunks): + records_payload = {"data": {"type": "datasets", "attributes": {"records": chunk}}} + url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" + resp = exp_http_request("POST", url, body=json.dumps(records_payload).encode("utf-8")) + + if show_progress: + _print_progress_bar(i + 1, total_chunks, prefix='Uploading:', suffix='Complete') # Print url to the dataset in Datadog - print(f"Dataset '{self.name}' created: {BASE_URL}/llm/experiments/datasets/{dataset_id}\n\n") + print(f"\nDataset '{self.name}' created: {BASE_URL}/llm/experiments/datasets/{dataset_id}\n") @classmethod def from_csv( @@ -485,10 +520,14 @@ def run_task( errors in the output. Defaults to False. Raises: - ValueError: If _jobs is not between 1 and 20 + ValueError: If _jobs is not between 1 and 30 """ - if not 1 <= _jobs <= 20: - raise ValueError("Number of jobs must be between 1 and 20") + if not 1 <= _jobs <= 30: + raise ValueError("Number of jobs must be between 1 and 30") + + @agent + def instrumented_task(input_data, config=None): # To trace the task + return self.task(input_data, config) self.outputs = [] total_rows = len(self.dataset) @@ -498,14 +537,20 @@ def run_task( def process_row(idx_row): idx, row = idx_row start_time = time.time() + ddtrace.tracer.context_provider.activate(Context()) + try: input_data = row['input'] if getattr(self.task, '_accepts_config', False): - output = self.task(input_data, self.config) + output = instrumented_task(input_data, self.config) else: - output = self.task(input_data) - + output = instrumented_task(input_data) + + # Periodic flush every 10 rows (approximate because it's concurrent) + if idx % 10 == 0: + LLMObs.flush() + output_data = { "idx": idx, "output": output, @@ -560,6 +605,7 @@ def process_row(idx_row): if raise_errors and output_data['error']['message']: error_message = output_data['error']['message'] raise ExperimentTaskError(error_message, idx, output_data['error']['type']) + elif output_data['error']['message']: error_count += 1 @@ -597,6 +643,9 @@ def process_row(idx_row): self.outputs = outputs_buffer self.has_run = True + + # Final flush at the end + LLMObs.flush() error_rate = (error_count / total_rows) * 100 print(f"Task completed with {error_count} errors ({error_rate:.2f}% error rate)") @@ -832,9 +881,12 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": return final_df - def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite + def push(self, chunk_size: int = 300) -> None: """Push the experiment results to Datadog. + Args: + chunk_size: Number of records to upload in each chunk. Defaults to 300. 
+ Raises: ValueError: If the dataset hasn't been pushed to Datadog first """ @@ -896,93 +948,107 @@ def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite experiment_id = response_data["data"]["id"] self.experiment.name = response_data["data"]["attributes"]["name"] - spans = [] - metrics = [] - for result in self.merged_results: - idx = result['idx'] - merged_result = result - output = merged_result.get('output') - input = merged_result.get('input', {}) - evaluations = merged_result.get('evaluations', {}) - expected_output = merged_result.get('expected_output', {}) - metadata = merged_result.get('metadata', {}) - error = merged_result.get('error', {}) - - # When the dataset is not hosted, we use the hash of the input and expected output as the dataset record id - dataset_record_id = hashlib.md5((str(input) + str(expected_output)).encode('utf-8')).hexdigest() - - span = { - "span_id": _make_id(), - "project_id": project_id, - "experiment_id": experiment_id, - "dataset_id": self.experiment.dataset._datadog_dataset_id, - #TODO: Extract the record id from the dataset for hosted datasets - "dataset_record_id": dataset_record_id, - "start_ns": int(metadata.get("timestamp", time.time()) * 1e9), - "duration": float(metadata.get("duration", 0) * 1e9), - "status": "ok" if not error else "error", - "metrics": {}, # TODO: Fill in with actual metrics once we have tracing and llm spans - "meta": { - "span": {"kind": "experiment"}, - "input": merged_result.get('input', {}), - "output": output, - "expected_output": merged_result.get('expected_output', {}), - "error": { - "message": error.get("message"), - "type": error.get("type"), - "stack": error.get("stack"), - } - }, - } - spans.append(span) - - # Add evaluation metrics - for metric_payload_name, metric_payload_value in evaluations.items(): - # Skip None values - if metric_payload_value is None: - print(f"Skipping None value for metric: {metric_payload_name}") - continue - - timestamp_ms = int(metadata.get("timestamp", time.time()) * 1000) - - # Check for bool first, since bool is a subclass of int - if isinstance(metric_payload_value["value"], (bool, str)): - metric_type = "categorical" - metric_value = str(metric_payload_value["value"]).lower() - elif isinstance(metric_payload_value["value"], (int, float)): - metric_type = "score" - else: - metric_type = "categorical" - metric_value = str(metric_payload_value["value"]) - - metric = { - "span_id": span["span_id"], - "metric_type": metric_type, - "timestamp_ms": timestamp_ms, - "label": metric_payload_name, - "score_value" if metric_type == "score" else "categorical_value": metric_value, - "error": metric_payload_value["error"], - } + # Process results in chunks + total_results = len(self.merged_results) + chunks = [self.merged_results[i:i + chunk_size] for i in range(0, total_results, chunk_size)] + total_chunks = len(chunks) - metrics.append(metric) + # Only show progress bar for large result sets + show_progress = total_results > chunk_size + if show_progress: + print(f"\nUploading {total_results} results in {total_chunks} chunks...") + _print_progress_bar(0, total_chunks, prefix='Uploading:', suffix='Complete') + for chunk_idx, chunk in enumerate(chunks): + spans = [] + metrics = [] + + # Process each result in the chunk + for result in chunk: + idx = result['idx'] + merged_result = result + output = merged_result.get('output') + input = merged_result.get('input', {}) + evaluations = merged_result.get('evaluations', {}) + expected_output = merged_result.get('expected_output', 
{}) + metadata = merged_result.get('metadata', {}) + error = merged_result.get('error', {}) + + # When the dataset is not hosted, we use the hash of the input and expected output as the dataset record id + dataset_record_id = hashlib.md5((str(input) + str(expected_output)).encode('utf-8')).hexdigest() + + span = { + "span_id": _make_id(), + "project_id": project_id, + "experiment_id": experiment_id, + "dataset_id": self.experiment.dataset._datadog_dataset_id, + #TODO: Extract the record id from the dataset for hosted datasets + "dataset_record_id": dataset_record_id, + "start_ns": int(metadata.get("timestamp", time.time()) * 1e9), + "duration": float(metadata.get("duration", 0) * 1e9), + "status": "ok" if not error else "error", + "metrics": {}, # TODO: Fill in with actual metrics once we have tracing and llm spans + "meta": { + "span": {"kind": "experiment"}, + "input": merged_result.get('input', {}), + "output": output, + "expected_output": merged_result.get('expected_output', {}), + "error": { + "message": error.get("message"), + "type": error.get("type"), + "stack": error.get("stack"), + } + }, + } + spans.append(span) + + # Add evaluation metrics + for metric_payload_name, metric_payload_value in evaluations.items(): + # Skip None values + if metric_payload_value is None: + print(f"Skipping None value for metric: {metric_payload_name}") + continue + + timestamp_ms = int(metadata.get("timestamp", time.time()) * 1000) + + # Check for bool first, since bool is a subclass of int + if isinstance(metric_payload_value["value"], (bool, str)): + metric_type = "categorical" + metric_value = str(metric_payload_value["value"]).lower() + elif isinstance(metric_payload_value["value"], (int, float)): + metric_type = "score" + else: + metric_type = "categorical" + metric_value = str(metric_payload_value["value"]) + + metric = { + "span_id": span["span_id"], + "metric_type": metric_type, + "timestamp_ms": timestamp_ms, + "label": metric_payload_name, + "score_value" if metric_type == "score" else "categorical_value": metric_value, + "error": metric_payload_value["error"], + } + metrics.append(metric) - # Prepare payload and send to Datadog - results_payload = { - "data": { - "type": "experiments", - "tags": self.experiment.tags + ["ddtrace.version:" + ddtrace.__version__], - "attributes": {"spans": spans, "metrics": metrics}, + # Prepare and send chunk payload + chunk_payload = { + "data": { + "type": "experiments", + "tags": self.experiment.tags + ["ddtrace.version:" + ddtrace.__version__], + "attributes": {"spans": spans, "metrics": metrics}, + } } - } + url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" + exp_http_request("POST", url, body=json.dumps(chunk_payload).encode("utf-8")) - url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" - exp_http_request("POST", url, body=json.dumps(results_payload).encode("utf-8")) + if show_progress: + _print_progress_bar(chunk_idx + 1, total_chunks, prefix='Uploading:', suffix='Complete') # Print URL to the experiment in Datadog - print(f"Experiment '{self.experiment.name}' created: {BASE_URL}/llm/experiments/experiment-list/{experiment_id} \n\n") + print(f"\nExperiment '{self.experiment.name}' created: {BASE_URL}/llm/experiments/experiment-list/{experiment_id}\n") def export_to_jsonl(self, file_path): """ From c05deb83d83c5a5a9ef2e08e31c4d80e7a7c3f02 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Mon, 27 Jan 2025 14:24:32 -0500 Subject: [PATCH 33/36] two buffers temporary --- .../contrib/internal/requests/connection.py | 
4 + ddtrace/llmobs/_constants.py | 2 +- ddtrace/llmobs/_experiments.py | 45 ++++-- ddtrace/llmobs/_llmobs.py | 32 +++++ ddtrace/llmobs/_utils.py | 2 + ddtrace/llmobs/_writer.py | 129 +++++++++++++++--- 6 files changed, 187 insertions(+), 27 deletions(-) diff --git a/ddtrace/contrib/internal/requests/connection.py b/ddtrace/contrib/internal/requests/connection.py index 06d3347f0a1..d7a19ec6eb0 100644 --- a/ddtrace/contrib/internal/requests/connection.py +++ b/ddtrace/contrib/internal/requests/connection.py @@ -102,7 +102,11 @@ def _wrap_send(func, instance, args, kwargs): span.set_tag(_ANALYTICS_SAMPLE_RATE_KEY, cfg.get("analytics_sample_rate", True)) # propagate distributed tracing headers + # breakpoint() if cfg.get("distributed_tracing"): + # breakpoint() + print("propagating headers") + print(span.context) HTTPPropagator.inject(span.context, request.headers) response = response_headers = None diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py index 27000b36aac..05f3b599664 100644 --- a/ddtrace/llmobs/_constants.py +++ b/ddtrace/llmobs/_constants.py @@ -15,7 +15,7 @@ INPUT_VALUE = "_ml_obs.meta.input.value" INPUT_PARAMETERS = "_ml_obs.meta.input.parameters" INPUT_PROMPT = "_ml_obs.meta.input.prompt" - +EXPECTED_OUTPUT = "_ml_obs.meta.input.expected_output" OUTPUT_DOCUMENTS = "_ml_obs.meta.output.documents" OUTPUT_MESSAGES = "_ml_obs.meta.output.messages" OUTPUT_VALUE = "_ml_obs.meta.output.value" diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index b9dfc360e64..835855f5d5b 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -21,6 +21,9 @@ from ddtrace.context import Context import ddtrace +from ddtrace import patch_all + +patch_all() DD_SITE = os.getenv("DD_SITE", "datadoghq.com") if DD_SITE == "datadoghq.com": @@ -35,11 +38,12 @@ class FileType(Enum): ml_app="experiment-jonathan", integrations_enabled=True, agentless_enabled=True, - site="datadoghq.com", + site=os.getenv("DD_SITE"), api_key=os.getenv("DD_API_KEY"), ) + class Dataset: """A container for LLM experiment data that can be pushed to and retrieved from Datadog. 
@@ -61,6 +65,8 @@ def __init__(self, name: str, data: Optional[List[Dict[str, Union[str, Dict[str, """ self.name = name self.description = description + self.version = 0 + # If no data provided, attempt to pull from Datadog if data is None: @@ -70,10 +76,12 @@ def __init__(self, name: str, data: Optional[List[Dict[str, Union[str, Dict[str, pulled_dataset = self.pull(name) self._data = pulled_dataset._data self._datadog_dataset_id = pulled_dataset._datadog_dataset_id + self._version = pulled_dataset._datadog_dataset_version else: self._validate_data(data) self._data = data self._datadog_dataset_id = None + self._version = 0 def __iter__(self) -> Iterator[Dict[str, Union[str, Dict[str, Any]]]]: return iter(self._data) @@ -143,6 +151,8 @@ def pull(cls, name: str) -> "Dataset": raise ValueError(f"Dataset '{name}' not found") dataset_id = datasets[0]["id"] + dataset_version = datasets[0]["attributes"]["current_version"] + # Get dataset records url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" @@ -168,6 +178,7 @@ def pull(cls, name: str) -> "Dataset": # Create new dataset instance dataset = cls(name, class_records) dataset._datadog_dataset_id = dataset_id + dataset._datadog_dataset_version = dataset_version return dataset def push(self, chunk_size: int = 300) -> None: @@ -207,6 +218,7 @@ def push(self, chunk_size: int = 300) -> None: response_data = resp.json() dataset_id = response_data["data"]["id"] self._datadog_dataset_id = dataset_id + self._datadog_dataset_version = 0 else: # Dataset exists, raise error raise ValueError( @@ -522,12 +534,17 @@ def run_task( Raises: ValueError: If _jobs is not between 1 and 30 """ + os.environ["DD_EXPERIMENTS_RUNNER_ENABLED"] = "True" if not 1 <= _jobs <= 30: raise ValueError("Number of jobs must be between 1 and 30") - @agent - def instrumented_task(input_data, config=None): # To trace the task - return self.task(input_data, config) + def instrumented_task(input_data, expected_output, config=None): + with LLMObs._experiment(name="experiment-task") as span: + span.context.set_baggage_item("is_experiment_task", True) + output = self.task(input_data, config) + # LLMObs._tag_expected_output(span, expected_output) + LLMObs.annotate(span, input_data=input_data, output_data=output) + return output self.outputs = [] total_rows = len(self.dataset) @@ -537,15 +554,15 @@ def instrumented_task(input_data, config=None): # To trace the task def process_row(idx_row): idx, row = idx_row start_time = time.time() - ddtrace.tracer.context_provider.activate(Context()) try: input_data = row['input'] + expected_output = row['expected_output'] if getattr(self.task, '_accepts_config', False): - output = instrumented_task(input_data, self.config) + output = instrumented_task(input_data, expected_output, self.config) else: - output = instrumented_task(input_data) + output = instrumented_task(input_data, expected_output) # Periodic flush every 10 rows (approximate because it's concurrent) if idx % 10 == 0: @@ -648,6 +665,8 @@ def process_row(idx_row): LLMObs.flush() error_rate = (error_count / total_rows) * 100 + os.environ["DD_EXPERIMENTS_RUNNER_ENABLED"] = "False" + os.environ["DD_LLMOBS_ENABLED"] = "False" print(f"Task completed with {error_count} errors ({error_rate:.2f}% error rate)") if error_count > 0: print("If you'd like to halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.") @@ -932,6 +951,7 @@ def push(self, chunk_size: int = 300) -> None: "description": self.experiment.description, "dataset_id": 
self.experiment.dataset._datadog_dataset_id, "project_id": project_id, + "dataset_version": self.experiment.dataset._datadog_dataset_version, "metadata": { "tags": self.experiment.tags, **(self.experiment.metadata or {}), @@ -989,7 +1009,7 @@ def push(self, chunk_size: int = 300) -> None: "status": "ok" if not error else "error", "metrics": {}, # TODO: Fill in with actual metrics once we have tracing and llm spans "meta": { - "span": {"kind": "experiment"}, + "span": {"kind": "experiment-result"}, "input": merged_result.get('input', {}), "output": output, "expected_output": merged_result.get('expected_output', {}), @@ -1011,16 +1031,21 @@ def push(self, chunk_size: int = 300) -> None: timestamp_ms = int(metadata.get("timestamp", time.time()) * 1000) + if metric_payload_value["value"] == None: + metric_type = "categorical" + metric_value = None # Check for bool first, since bool is a subclass of int - if isinstance(metric_payload_value["value"], (bool, str)): + elif isinstance(metric_payload_value["value"], (bool, str)): metric_type = "categorical" metric_value = str(metric_payload_value["value"]).lower() elif isinstance(metric_payload_value["value"], (int, float)): metric_type = "score" + metric_value = metric_payload_value["value"] else: metric_type = "categorical" metric_value = str(metric_payload_value["value"]) + metric = { "span_id": span["span_id"], "metric_type": metric_type, @@ -1037,7 +1062,7 @@ def push(self, chunk_size: int = 300) -> None: "data": { "type": "experiments", "tags": self.experiment.tags + ["ddtrace.version:" + ddtrace.__version__], - "attributes": {"spans": spans, "metrics": metrics}, + "attributes": {"spans": [], "metrics": []} #metrics}, #TODO: Remove this whole thing since experiment spans results will be part of tracing } } diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index b4f1dc1b2f6..0da238f3d0d 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -52,6 +52,7 @@ from ddtrace.llmobs._constants import SPAN_KIND from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING from ddtrace.llmobs._constants import TAGS +from ddtrace.llmobs._constants import EXPECTED_OUTPUT from ddtrace.llmobs._evaluators.runner import EvaluatorRunner from ddtrace.llmobs._utils import AnnotationContext from ddtrace.llmobs._utils import _get_llmobs_parent_id @@ -193,6 +194,9 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: span._set_ctx_item(ML_APP, ml_app) parent_id = str(_get_llmobs_parent_id(span) or "undefined") + if span._get_ctx_item(EXPECTED_OUTPUT) is not None: + meta["expected_output"] = span._get_ctx_item(EXPECTED_OUTPUT) + llmobs_span_event = { "trace_id": "{:x}".format(span.trace_id), "span_id": str(span.span_id), @@ -626,6 +630,22 @@ def agent(cls, name: Optional[str] = None, session_id: Optional[str] = None, ml_ if cls.enabled is False: log.warning(SPAN_START_WHILE_DISABLED_WARNING) return cls._instance._start_span("agent", name=name, session_id=session_id, ml_app=ml_app) + + @classmethod + def _experiment(cls, name: Optional[str] = None, session_id: Optional[str] = None, ml_app: Optional[str] = None) -> Span: + """ + Trace a dynamic workflow in which an embedded language model (agent) decides what sequence of actions to take. + + :param str name: The name of the traced operation. If not provided, a default value of "agent" will be set. + :param str session_id: The ID of the underlying user session. Required for tracking sessions. 
+ :param str ml_app: The name of the ML application that the agent is orchestrating. If not provided, the default + value will be set to the value of `DD_LLMOBS_ML_APP`. + + :returns: The Span object representing the traced operation. + """ + if cls.enabled is False: + log.warning(SPAN_START_WHILE_DISABLED_WARNING) + return cls._instance._start_span("experiment", name=name, session_id=session_id, ml_app=ml_app) @classmethod def workflow( @@ -788,6 +808,18 @@ def annotate( else: cls._tag_text_io(span, input_value=input_data, output_value=output_data) + @staticmethod + def _tag_expected_output(span, expected_output: dict) -> None: + """Tags a given LLMObs span with a prompt""" + try: + span._set_ctx_item(EXPECTED_OUTPUT, expected_output) + print("added expected output") + print("expected output: ", span._get_ctx_item(EXPECTED_OUTPUT)) + print("span: ", span) + except TypeError: + log.warning("Failed to validate expected output with error: ", exc_info=True) + return + @staticmethod def _tag_prompt(span, prompt: dict) -> None: """Tags a given LLMObs span with a prompt""" diff --git a/ddtrace/llmobs/_utils.py b/ddtrace/llmobs/_utils.py index 827603cc93d..74751944621 100644 --- a/ddtrace/llmobs/_utils.py +++ b/ddtrace/llmobs/_utils.py @@ -167,6 +167,7 @@ def _get_session_id(span: Span) -> Optional[str]: def _inject_llmobs_parent_id(span_context): """Inject the LLMObs parent ID into the span context for reconnecting distributed LLMObs traces.""" span = ddtrace.tracer.current_span() + if span is None: log.warning("No active span to inject LLMObs parent ID info.") return @@ -178,6 +179,7 @@ def _inject_llmobs_parent_id(span_context): llmobs_parent_id = str(span.span_id) else: llmobs_parent_id = _get_llmobs_parent_id(span) + span_context._meta[PROPAGATED_PARENT_ID_KEY] = llmobs_parent_id or "undefined" diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index e1dd9280ff7..b8a2756d22c 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -32,6 +32,8 @@ from ddtrace.llmobs._constants import EVP_SUBDOMAIN_HEADER_NAME from ddtrace.llmobs._constants import EVP_SUBDOMAIN_HEADER_VALUE from ddtrace.llmobs._utils import safe_json +from ddtrace.internal.utils.formats import asbool +import os logger = get_logger(__name__) @@ -188,35 +190,101 @@ def __len__(self): def _init_buffer(self): with self._lock: self._buffer = [] + self._experiment_buffer = [] self.buffer_size = 0 def put(self, events: List[LLMObsSpanEvent]): - # events always has only 1 event - with List type to be compatible with HTTPWriter interfaces - with self._lock: - if len(self._buffer) >= self._buffer_limit: - logger.warning( - "%r event buffer full (limit is %d), dropping event", self.__class__.__name__, self._buffer_limit - ) - return - self._buffer.extend(events) - self.buffer_size += len(safe_json(events)) + # Split incoming events into normal vs experiment spans + norm_events = [] + exp_events = [] + for e in events: + if e.get("meta", {}).get("span.kind") == "experiment": + exp_events.append(e) + else: + norm_events.append(e) + + # Add normal spans to main buffer + if norm_events: + with self._lock: + if len(self._buffer) + len(norm_events) > self._buffer_limit: + logger.warning("Dropping normal spans: buffer limit reached") + return + self._buffer.extend(norm_events) + self.buffer_size += len(safe_json(norm_events)) + + # Add experiment spans to separate buffer + if exp_events: + with self._lock: + if len(self._experiment_buffer) + len(exp_events) > self._buffer_limit: + logger.warning("Dropping 
experiment spans: buffer limit reached") + return + self._experiment_buffer.extend(exp_events) + self.buffer_size += len(safe_json(exp_events)) def encode(self): + """Encode only the normal spans for standard flush""" with self._lock: if not self._buffer: return None, 0 events = self._buffer + + # Save experiment buffer before _init_buffer() clears it + experiment_spans = self._experiment_buffer self._init_buffer() - data = {"_dd.stage": "raw", "_dd.tracer_version": ddtrace.__version__, "event_type": "span", "spans": events} + + data = { + "_dd.stage": "raw", + "_dd.tracer_version": ddtrace.__version__, + "event_type": "span", + "spans": events + } + + if asbool(os.getenv("DD_EXPERIMENTS_RUNNER_ENABLED")): + data["_dd.scope"] = "experiments" + + try: + enc_data = safe_json(data) + if isinstance(enc_data, str): + enc_data = enc_data.encode('utf-8') + logger.debug("encode %d LLMObs span events", len(events)) + except TypeError: + logger.error("failed to encode LLMObs span events", exc_info=True) + return None, 0 + + # Restore experiment buffer + with self._lock: + self._experiment_buffer = experiment_spans + + return enc_data, len(events) + + def encode_experiment_spans(self): + """Encode only the experiment spans for separate request""" + with self._lock: + if not self._experiment_buffer: + return None, 0 + exp_events = self._experiment_buffer + self._experiment_buffer = [] + + data = { + "_dd.stage": "raw", + "_dd.tracer_version": ddtrace.__version__, + "event_type": "experiment-span", + "experiment_spans": exp_events + } + + if asbool(os.getenv("DD_EXPERIMENTS_RUNNER_ENABLED")): + data["_dd.scope"] = "experiments" + try: - enc_llm_events = safe_json(data) - if isinstance(enc_llm_events, str): - enc_llm_events = enc_llm_events.encode('utf-8') - logger.debug("encode %d LLMObs span events to be sent", len(events)) + enc_data = safe_json(data) + if isinstance(enc_data, str): + enc_data = enc_data.encode('utf-8') + logger.debug("encode %d LLMObs experiment span events", len(exp_events)) except TypeError: - logger.error("failed to encode %d LLMObs span events", len(events), exc_info=True) + logger.error("failed to encode LLMObs experiment span events", exc_info=True) return None, 0 - return enc_llm_events, len(events) + + return enc_data, len(exp_events) class LLMObsEventClient(WriterClientBase): @@ -310,6 +378,35 @@ def recreate(self): is_agentless=config._llmobs_agentless_enabled, ) + def periodic(self) -> None: + # First flush normal spans using parent logic + super(LLMObsSpanWriter, self).periodic() + + # Then flush experiment spans in a separate request + for client in self._clients: + if isinstance(client, LLMObsEventClient) and isinstance(client.encoder, LLMObsSpanEncoder): + encoded, count = client.encoder.encode_experiment_spans() + if not encoded or not count: + continue + + try: + print("Sending experiment spans") + print(encoded) + self._send_payload_with_backoff(encoded, count, client) + except Exception: + self._metrics_dist("http.errors", tags=["type:err"]) + self._metrics_dist("http.dropped.bytes", len(encoded)) + self._metrics_dist("http.dropped.traces", count) + logger.error( + "failed to send %d experiment spans to %s", + count, + self.intake_url, + exc_info=True + ) + else: + self._metrics_dist("http.sent.bytes", len(encoded)) + self._metrics_dist("http.sent.traces", count) + def _truncate_span_event(event: LLMObsSpanEvent) -> LLMObsSpanEvent: event["meta"]["input"] = {"value": DROPPED_VALUE_TEXT} From 1b0800b3817974324f3efc4cc18e1014002bd194 Mon Sep 17 00:00:00 2001 From: 
Jonathan Chavez Date: Thu, 30 Jan 2025 12:14:06 -0500 Subject: [PATCH 34/36] switch trace ingestion path --- ddtrace/llmobs/_experiments.py | 488 +++++++++++++++++++-------------- ddtrace/llmobs/_llmobs.py | 3 - ddtrace/llmobs/_writer.py | 2 - 3 files changed, 281 insertions(+), 212 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 835855f5d5b..d6d72284a6f 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -170,6 +170,7 @@ def pull(cls, name: str) -> "Dataset": expected_output = attrs.get("expected_output") class_records.append({ + "record_id": record.get("id"), "input": input_data, "expected_output": expected_output, **attrs.get("metadata", {}), @@ -182,16 +183,10 @@ def pull(cls, name: str) -> "Dataset": return dataset def push(self, chunk_size: int = 300) -> None: - """Push the dataset to Datadog. + """Push the dataset to Datadog and refresh with pulled data. Args: chunk_size: Number of records to upload in each chunk. Defaults to 300. - - Returns: - Dict[str, Any]: Dictionary containing dataset information including: - - dataset_id: The ID of the created/updated dataset - - dataset_name: The name of the dataset - - record_count: Number of records uploaded """ # Check if dataset exists encoded_name = quote(self.name) @@ -245,6 +240,12 @@ def push(self, chunk_size: int = 300) -> None: if show_progress: _print_progress_bar(i + 1, total_chunks, prefix='Uploading:', suffix='Complete') + # Pull the dataset to get all record IDs and metadata + pulled_dataset = self.pull(self.name) + self._data = pulled_dataset._data + self._datadog_dataset_id = pulled_dataset._datadog_dataset_id + self._datadog_dataset_version = pulled_dataset._datadog_dataset_version + # Print url to the dataset in Datadog print(f"\nDataset '{self.name}' created: {BASE_URL}/llm/experiments/datasets/{dataset_id}\n") @@ -504,14 +505,16 @@ def __init__( self.metadata = metadata self.config = config - # Enforce that the task function has the @task decorator - if not hasattr(self.task, '_is_task'): + # Make sure the task is decorated with @task + if not hasattr(self.task, "_is_task"): raise TypeError("Task function must be decorated with @task decorator.") - # Enforce that all evaluators have the @evaluator decorator + # Make sure every evaluator is decorated with @evaluator for evaluator_func in self.evaluators: - if not hasattr(evaluator_func, '_is_evaluator'): - raise TypeError(f"Evaluator '{evaluator_func.__name__}' must be decorated with @evaluator decorator.") + if not hasattr(evaluator_func, "_is_evaluator"): + raise TypeError( + f"Evaluator '{evaluator_func.__name__}' must be decorated with @evaluator decorator." + ) # Post-run attributes self.has_run = False @@ -519,98 +522,228 @@ def __init__( self.outputs = [] self.evaluations = [] - def run_task( + # We'll store the experiment's Datadog ID once it's created. + self._datadog_experiment_id: Optional[str] = None + self._datadog_project_id: Optional[str] = None + + def _get_or_create_project(self) -> str: + """ + Internal helper to retrieve or create a project in Datadog, returning the project_id. 
+ """ + url = f"/api/unstable/llm-obs/v1/projects?filter[name]={self.project_name}" + resp = exp_http_request("GET", url) + response_data = resp.json() + projects = response_data.get("data", []) + + if not projects: + # Create new project + project_payload = { + "data": { + "type": "projects", + "attributes": { + "name": self.project_name, + "description": "", + "metadata": {"team": "ml-obs"}, + }, + } + } + resp = exp_http_request( + "POST", + "/api/unstable/llm-obs/v1/projects", + body=json.dumps(project_payload).encode("utf-8"), + ) + response_data = resp.json() + return response_data["data"]["id"] + else: + return projects[0]["id"] + + def _create_experiment_in_datadog(self) -> str: + """ + Internal helper to create an experiment in Datadog, returning the new experiment_id. + Raises ValueError if the dataset hasn't been pushed (no _datadog_dataset_id). + """ + if not self.dataset._datadog_dataset_id: + raise ValueError( + "Dataset must be pushed to Datadog (so it has an ID) before creating an experiment. " + "Please call dataset.push() first." + ) + + project_id = self._get_or_create_project() + + experiment_payload = { + "data": { + "type": "experiments", + "attributes": { + "name": self.name, + "description": self.description, + "dataset_id": self.dataset._datadog_dataset_id, + "project_id": project_id, + "dataset_version": self.dataset._datadog_dataset_version, + "metadata": { + "tags": self.tags, + **(self.metadata or {}), + "config": self.config, + }, + "ensure_unique": True, + }, + } + } + resp = exp_http_request( + "POST", + "/api/unstable/llm-obs/v1/experiments", + body=json.dumps(experiment_payload).encode("utf-8"), + ) + response_data = resp.json() + experiment_id = response_data["data"]["id"] + + # The API may rename the experiment (e.g., adding a suffix), so update local name: + self.name = response_data["data"]["attributes"]["name"] + return experiment_id + + def run( self, _jobs: int = 10, raise_errors: bool = False, - ) -> None: - """Execute the task function on the dataset and store the outputs. + ) -> "ExperimentResults": + """ + Execute the task and evaluations, returning the results. + Here, we guarantee an experiment is created first, + so run_task() can tag traces with the real experiment ID. + """ + print("Running experiment...") + # 1) Make sure the dataset is pushed + if not self.dataset._datadog_dataset_id: + raise ValueError( + "Dataset must be pushed to Datadog before running the experiment." + ) - Args: - _jobs: Number of concurrent jobs to run (between 1-20). Defaults to 10. - raise_errors: If True, raises exceptions from failed tasks. If False, stores - errors in the output. Defaults to False. + # 2) Create project + experiment if this hasn't been done yet + if not self._datadog_experiment_id: + project_id = self._get_or_create_project() # your existing helper + self._datadog_project_id = project_id + + experiment_id = self._create_experiment_in_datadog() # your existing helper + self._datadog_experiment_id = experiment_id - Raises: - ValueError: If _jobs is not between 1 and 30 + # 3) Now run the task and evaluations + self.run_task(_jobs=_jobs, raise_errors=raise_errors) + experiment_results = self.run_evaluations(raise_errors=raise_errors) + return experiment_results + + def run_task( + self, + _jobs: int = 10, + raise_errors: bool = False, + ) -> None: + """ + Execute the task function on the dataset and store the outputs. + The caller (run()) ensures that self._datadog_experiment_id is set first. 
""" os.environ["DD_EXPERIMENTS_RUNNER_ENABLED"] = "True" if not 1 <= _jobs <= 30: raise ValueError("Number of jobs must be between 1 and 30") - - def instrumented_task(input_data, expected_output, config=None): + + def instrumented_task( + record_id: str, input_data: Any, expected_output: Any, config: Optional[Dict[str, Any]] = None + ): with LLMObs._experiment(name="experiment-task") as span: span.context.set_baggage_item("is_experiment_task", True) output = self.task(input_data, config) - # LLMObs._tag_expected_output(span, expected_output) - LLMObs.annotate(span, input_data=input_data, output_data=output) - return output - + LLMObs.annotate( + span, + input_data=input_data, + output_data=output, + tags={ + "dataset_id": self.dataset._datadog_dataset_id, + "dataset_record_id": record_id, + "experiment_id": self._datadog_experiment_id, + + }, + ) + LLMObs._tag_expected_output(span, expected_output) + return (output, span) + self.outputs = [] total_rows = len(self.dataset) completed = 0 - error_count = 0 + error_count = 0 def process_row(idx_row): idx, row = idx_row start_time = time.time() - - try: - input_data = row['input'] - expected_output = row['expected_output'] - - if getattr(self.task, '_accepts_config', False): - output = instrumented_task(input_data, expected_output, self.config) - else: - output = instrumented_task(input_data, expected_output) - - # Periodic flush every 10 rows (approximate because it's concurrent) - if idx % 10 == 0: - LLMObs.flush() - - output_data = { - "idx": idx, - "output": output, - "metadata": { - "timestamp": start_time, - "duration": time.time() - start_time, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "error": { - "message": None, - "stack": None, - "type": None, + + with LLMObs._experiment(name="experiment-task") as span: + span.context.set_baggage_item("is_experiment_task", True) + try: + input_data = row["input"] + expected_output = row["expected_output"] + + if getattr(self.task, "_accepts_config", False): + output = self.task(input_data, self.config) + else: + output = self.task(input_data) + + # Periodic flush for concurrency + if idx % 10 == 0: + LLMObs.flush() + + LLMObs.annotate( + span, + input_data=input_data, + output_data=output, + tags={ + "dataset_id": self.dataset._datadog_dataset_id, + "dataset_record_id": row["record_id"], + "experiment_id": self._datadog_experiment_id, + }, + ) + LLMObs._tag_expected_output(span, expected_output) + + return { + "idx": idx, + "output": output, + "metadata": { + "timestamp": start_time, + "duration": time.time() - start_time, + "dataset_record_index": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + "span_id": span.span_id, + "trace_id": span.trace_id, + }, + "error": {"message": None, "stack": None, "type": None}, } - } - return output_data - except Exception as e: - error_message = str(e) - return { - "idx": idx, - "output": None, - "metadata": { - "timestamp": start_time, - "duration": time.time() - start_time, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "error": { - "message": error_message, - "stack": traceback.format_exc(), - "type": type(e).__name__, + except Exception as e: + error_message = str(e) + return { + "idx": idx, + "output": None, + "metadata": { + "timestamp": start_time, + "duration": time.time() - start_time, + 
"dataset_record_index": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + "span_id": span.span_id, + "trace_id": span.trace_id, + }, + "error": { + "message": error_message, + "stack": traceback.format_exc(), + "type": type(e).__name__, + } } - } _print_progress_bar(0, total_rows, prefix='Processing:', suffix='Complete') with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: - futures = {executor.submit(process_row, (idx, row)): idx for idx, row in enumerate(self.dataset)} + futures = { + executor.submit(process_row, (idx, row)): idx + for idx, row in enumerate(self.dataset) + } outputs_buffer = [None] * total_rows try: @@ -619,11 +752,13 @@ def process_row(idx_row): try: output_data = future.result() outputs_buffer[idx] = output_data - if raise_errors and output_data['error']['message']: - error_message = output_data['error']['message'] - raise ExperimentTaskError(error_message, idx, output_data['error']['type']) - - elif output_data['error']['message']: + if raise_errors and output_data["error"]["message"]: + error_message = output_data["error"]["message"] + raise ExperimentTaskError( + error_message, idx, output_data["error"]["type"] + ) + + elif output_data["error"]["message"]: error_count += 1 except Exception as e: @@ -633,16 +768,18 @@ def process_row(idx_row): "metadata": { "timestamp": time.time(), "duration": 0, - "dataset_record_idx": idx, + "dataset_record_index": idx, "project_name": self.project_name, "experiment_name": self.name, "dataset_name": self.dataset.name, + "span_id": span.span_id, + "trace_id": span.trace_id, }, "error": { "message": str(e), "stack": traceback.format_exc(), "type": type(e).__name__, - } + }, } if raise_errors: raise e @@ -650,7 +787,7 @@ def process_row(idx_row): error_count += 1 completed += 1 - _print_progress_bar(completed, total_rows, prefix='Processing:', suffix='Complete') + _print_progress_bar(completed, total_rows, prefix="Processing:", suffix="Complete") except Exception as e: for future in futures: @@ -660,8 +797,7 @@ def process_row(idx_row): self.outputs = outputs_buffer self.has_run = True - - # Final flush at the end + LLMObs.flush() error_rate = (error_count / total_rows) * 100 @@ -669,9 +805,16 @@ def process_row(idx_row): os.environ["DD_LLMOBS_ENABLED"] = "False" print(f"Task completed with {error_count} errors ({error_rate:.2f}% error rate)") if error_count > 0: - print("If you'd like to halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.") + print( + "If you'd like to halt execution on errors and see the full traceback, " + "set `raise_errors=True` when running the experiment." + ) - def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_errors: bool = False) -> "ExperimentResults": + def run_evaluations( + self, + evaluators: Optional[List[Callable]] = None, + raise_errors: bool = False + ) -> "ExperimentResults": """Run evaluators on the outputs and return ExperimentResults. Args: @@ -753,26 +896,7 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err self.has_evaluated = True return ExperimentResults(self.dataset, self, self.outputs, evaluations) - def run( - self, - _jobs: int = 10, - raise_errors: bool = False, - ) -> "ExperimentResults": - """Execute the task and evaluations, returning the results. - - Args: - _jobs (int): Number of worker threads. - timeout (float, optional): Time limit for the task execution in seconds. 
- raise_errors (bool): If True, raises exceptions from failed tasks. If False, stores - errors in the output. Defaults to False. - - Returns: - ExperimentResults: The results of the experiment. - """ - self.run_task(_jobs=_jobs, raise_errors=raise_errors) - experiment_results = self.run_evaluations(raise_errors=raise_errors) - return experiment_results - + class ExperimentResults: """Contains and manages the results of an experiment run. @@ -808,6 +932,7 @@ def _merge_results(self) -> List[Dict[str, Any]]: merged_result = { "idx": idx, + "record_id": dataset_record.get('record_id'), "input": dataset_record.get('input', {}), "expected_output": dataset_record.get('expected_output', {}), "output": output_data.get('output'), @@ -901,80 +1026,43 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": return final_df def push(self, chunk_size: int = 300) -> None: - """Push the experiment results to Datadog. - - Args: - chunk_size: Number of records to upload in each chunk. Defaults to 300. - - Raises: - ValueError: If the dataset hasn't been pushed to Datadog first """ + Push the experiment results to Datadog, without re-creating the project/experiment. + Assumes self.experiment._datadog_experiment_id and self.experiment._datadog_project_id + have already been set in Experiment.run(). + """ + # Ensure the dataset is hosted in Datadog if not self.experiment.dataset._datadog_dataset_id: raise ValueError( "Dataset has not been pushed to Datadog. " "Please call dataset.push() before pushing experiment results." ) - # Check if project exists - url = f"/api/unstable/llm-obs/v1/projects?filter[name]={self.experiment.project_name}" - resp = exp_http_request("GET", url) - response_data = resp.json() - projects = response_data.get("data", []) - if not projects: - # Create new project - project_payload = { - "data": { - "type": "projects", - "attributes": { - "name": self.experiment.project_name, - "description": "", - "metadata": {"team": "ml-obs"}, - }, - } - } - resp = exp_http_request( - "POST", - "/api/unstable/llm-obs/v1/projects", - body=json.dumps(project_payload).encode("utf-8"), + # Ensure the experiment was already created (via run()) + if not self.experiment._datadog_experiment_id: + raise ValueError( + "Experiment has not been created in Datadog. " + "Please call experiment.run() before pushing results." 
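
The rewritten push() above no longer creates the project or experiment; it only validates the stored IDs and then reports evaluations through the eval-metric intake, joining each metric to the span produced while the task ran. As a rough illustration of one metric entry assembled further down in this method (the ids, timestamp, and values are invented):

example_metric = {
    "span_id": "8367351304781251223",
    "trace_id": "67a2c1de00000000a9f3b6c2d4e5f601",
    "metric_type": "categorical",        # bools and strings map here; ints/floats become "score"
    "timestamp_ms": 1706900000000,
    "label": "non_empty",
    "categorical_value": "true",         # boolean evaluator results are lower-cased strings
    "error": None,
    "join_on": {
        "span": {
            "trace_id": "67a2c1de00000000a9f3b6c2d4e5f601",
            "span_id": "8367351304781251223",
        },
    },
}
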
) - response_data = resp.json() - project_id = response_data["data"]["id"] - else: - project_id = projects[0]["id"] - # Create new experiment - experiment_payload = { - "data": { - "type": "experiments", - "attributes": { - "name": self.experiment.name, - "description": self.experiment.description, - "dataset_id": self.experiment.dataset._datadog_dataset_id, - "project_id": project_id, - "dataset_version": self.experiment.dataset._datadog_dataset_version, - "metadata": { - "tags": self.experiment.tags, - **(self.experiment.metadata or {}), - "config": self.experiment.config, - }, - "ensure_unique": True, # Generates a new experiment with a unique name if the experiment name already exists - }, - } - } - resp = exp_http_request( - "POST", "/api/unstable/llm-obs/v1/experiments", body=json.dumps(experiment_payload).encode("utf-8") - ) - response_data = resp.json() - experiment_id = response_data["data"]["id"] - self.experiment.name = response_data["data"]["attributes"]["name"] + # Grab IDs from the already-created experiment + experiment_id = self.experiment._datadog_experiment_id + project_id = self.experiment._datadog_project_id + experiment_name = self.experiment.name + + # Now proceed with chunked uploading of your results — no project or experiment creation here. - # Process results in chunks total_results = len(self.merged_results) - chunks = [self.merged_results[i:i + chunk_size] for i in range(0, total_results, chunk_size)] + # Optional progress bar + show_progress = total_results > chunk_size + + # Just an example of how you'd do chunked uploads: + chunks = [ + self.merged_results[i : i + chunk_size] + for i in range(0, total_results, chunk_size) + ] total_chunks = len(chunks) - # Only show progress bar for large result sets - show_progress = total_results > chunk_size if show_progress: print(f"\nUploading {total_results} results in {total_chunks} chunks...") _print_progress_bar(0, total_chunks, prefix='Uploading:', suffix='Complete') @@ -988,39 +1076,15 @@ def push(self, chunk_size: int = 300) -> None: idx = result['idx'] merged_result = result output = merged_result.get('output') + record_id = merged_result.get('record_id') input = merged_result.get('input', {}) evaluations = merged_result.get('evaluations', {}) expected_output = merged_result.get('expected_output', {}) - metadata = merged_result.get('metadata', {}) error = merged_result.get('error', {}) + metadata = merged_result.get('metadata', {}) + span_id = metadata.get('span_id') + trace_id = metadata.get('trace_id') - # When the dataset is not hosted, we use the hash of the input and expected output as the dataset record id - dataset_record_id = hashlib.md5((str(input) + str(expected_output)).encode('utf-8')).hexdigest() - - span = { - "span_id": _make_id(), - "project_id": project_id, - "experiment_id": experiment_id, - "dataset_id": self.experiment.dataset._datadog_dataset_id, - #TODO: Extract the record id from the dataset for hosted datasets - "dataset_record_id": dataset_record_id, - "start_ns": int(metadata.get("timestamp", time.time()) * 1e9), - "duration": float(metadata.get("duration", 0) * 1e9), - "status": "ok" if not error else "error", - "metrics": {}, # TODO: Fill in with actual metrics once we have tracing and llm spans - "meta": { - "span": {"kind": "experiment-result"}, - "input": merged_result.get('input', {}), - "output": output, - "expected_output": merged_result.get('expected_output', {}), - "error": { - "message": error.get("message"), - "type": error.get("type"), - "stack": error.get("stack"), - } - }, 
- } - spans.append(span) # Add evaluation metrics for metric_payload_name, metric_payload_value in evaluations.items(): @@ -1047,33 +1111,43 @@ def push(self, chunk_size: int = 300) -> None: metric = { - "span_id": span["span_id"], + "span_id": str(span_id), + "trace_id": str(trace_id), "metric_type": metric_type, "timestamp_ms": timestamp_ms, "label": metric_payload_name, "score_value" if metric_type == "score" else "categorical_value": metric_value, "error": metric_payload_value["error"], + "join_on": { + "span": { + "trace_id": str(trace_id), + "span_id": str(span_id), + }, + } } metrics.append(metric) + # Prepare and send chunk payload chunk_payload = { "data": { - "type": "experiments", - "tags": self.experiment.tags + ["ddtrace.version:" + ddtrace.__version__], - "attributes": {"spans": [], "metrics": []} #metrics}, #TODO: Remove this whole thing since experiment spans results will be part of tracing + "type": "evaluation_metric", + "attributes": {"scope": "experiments", "metrics": metrics, "tags": self.experiment.tags + ["ddtrace.version:" + ddtrace.__version__, "experiment_id:" + experiment_id]}, } } - url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" + print("chunk_payload: ", chunk_payload) + + url = f"/api/intake/llm-obs/v2/eval-metric" exp_http_request("POST", url, body=json.dumps(chunk_payload).encode("utf-8")) if show_progress: - _print_progress_bar(chunk_idx + 1, total_chunks, prefix='Uploading:', suffix='Complete') + _print_progress_bar( + chunk_idx + 1, total_chunks, prefix='Uploading:', suffix='Complete' + ) - # Print URL to the experiment in Datadog - print(f"\nExperiment '{self.experiment.name}' created: {BASE_URL}/llm/experiments/experiment-list/{experiment_id}\n") + print(f"\nExperiment '{experiment_name}' results pushed to Datadog.\n") def export_to_jsonl(self, file_path): """ diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 0da238f3d0d..ef5d96ffa88 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -813,9 +813,6 @@ def _tag_expected_output(span, expected_output: dict) -> None: """Tags a given LLMObs span with a prompt""" try: span._set_ctx_item(EXPECTED_OUTPUT, expected_output) - print("added expected output") - print("expected output: ", span._get_ctx_item(EXPECTED_OUTPUT)) - print("span: ", span) except TypeError: log.warning("Failed to validate expected output with error: ", exc_info=True) return diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index b8a2756d22c..7f6be1d6fd4 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -390,8 +390,6 @@ def periodic(self) -> None: continue try: - print("Sending experiment spans") - print(encoded) self._send_payload_with_backoff(encoded, count, client) except Exception: self._metrics_dist("http.errors", tags=["type:err"]) From b8a247259c1c79e13848fedd99c01576fe230a90 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Wed, 5 Feb 2025 16:46:09 -0500 Subject: [PATCH 35/36] tracing stable --- .../contrib/internal/requests/connection.py | 4 - ddtrace/llmobs/_constants.py | 2 + ddtrace/llmobs/_experiments.py | 194 ++++++------------ ddtrace/llmobs/_integrations/base.py | 3 +- ddtrace/llmobs/_llmobs.py | 19 ++ ddtrace/llmobs/_writer.py | 124 ++--------- 6 files changed, 104 insertions(+), 242 deletions(-) diff --git a/ddtrace/contrib/internal/requests/connection.py b/ddtrace/contrib/internal/requests/connection.py index d7a19ec6eb0..06d3347f0a1 100644 --- a/ddtrace/contrib/internal/requests/connection.py +++ 
b/ddtrace/contrib/internal/requests/connection.py @@ -102,11 +102,7 @@ def _wrap_send(func, instance, args, kwargs): span.set_tag(_ANALYTICS_SAMPLE_RATE_KEY, cfg.get("analytics_sample_rate", True)) # propagate distributed tracing headers - # breakpoint() if cfg.get("distributed_tracing"): - # breakpoint() - print("propagating headers") - print(span.context) HTTPPropagator.inject(span.context, request.headers) response = response_headers = None diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py index 05f3b599664..d060f8173f7 100644 --- a/ddtrace/llmobs/_constants.py +++ b/ddtrace/llmobs/_constants.py @@ -16,6 +16,8 @@ INPUT_PARAMETERS = "_ml_obs.meta.input.parameters" INPUT_PROMPT = "_ml_obs.meta.input.prompt" EXPECTED_OUTPUT = "_ml_obs.meta.input.expected_output" +EXPERIMENT_INPUT = "_ml_obs.meta.input" +EXPERIMENT_OUTPUT = "_ml_obs.meta.output" OUTPUT_DOCUMENTS = "_ml_obs.meta.output.documents" OUTPUT_MESSAGES = "_ml_obs.meta.output.messages" OUTPUT_VALUE = "_ml_obs.meta.output.value" diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index d6d72284a6f..335c0ff4e30 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -229,7 +229,6 @@ def push(self, chunk_size: int = 300) -> None: # Only show progress bar for large datasets show_progress = total_records > chunk_size if show_progress: - print(f"\nUploading {total_records} records in {total_chunks} chunks...") _print_progress_bar(0, total_chunks, prefix='Uploading:', suffix='Complete') for i, chunk in enumerate(chunks): @@ -602,7 +601,7 @@ def _create_experiment_in_datadog(self) -> str: def run( self, - _jobs: int = 10, + jobs: int = 10, raise_errors: bool = False, ) -> "ExperimentResults": """ @@ -626,55 +625,24 @@ def run( self._datadog_experiment_id = experiment_id # 3) Now run the task and evaluations - self.run_task(_jobs=_jobs, raise_errors=raise_errors) + self.run_task(_jobs=jobs, raise_errors=raise_errors) experiment_results = self.run_evaluations(raise_errors=raise_errors) return experiment_results - def run_task( - self, - _jobs: int = 10, - raise_errors: bool = False, - ) -> None: + def run_task(self, _jobs: int = 50, raise_errors: bool = False) -> None: """ - Execute the task function on the dataset and store the outputs. - The caller (run()) ensures that self._datadog_experiment_id is set first. + Execute the task function on the dataset concurrently using ThreadPoolExecutor.map, + updating progress via _print_progress_bar and processing more rows in parallel. 
""" os.environ["DD_EXPERIMENTS_RUNNER_ENABLED"] = "True" - if not 1 <= _jobs <= 30: - raise ValueError("Number of jobs must be between 1 and 30") - - def instrumented_task( - record_id: str, input_data: Any, expected_output: Any, config: Optional[Dict[str, Any]] = None - ): - with LLMObs._experiment(name="experiment-task") as span: - span.context.set_baggage_item("is_experiment_task", True) - output = self.task(input_data, config) - LLMObs.annotate( - span, - input_data=input_data, - output_data=output, - tags={ - "dataset_id": self.dataset._datadog_dataset_id, - "dataset_record_id": record_id, - "experiment_id": self._datadog_experiment_id, - - }, - ) - LLMObs._tag_expected_output(span, expected_output) - return (output, span) - - self.outputs = [] total_rows = len(self.dataset) - completed = 0 - error_count = 0 def process_row(idx_row): idx, row = idx_row start_time = time.time() - - with LLMObs._experiment(name="experiment-task") as span: - span.context.set_baggage_item("is_experiment_task", True) - try: + try: + with LLMObs._experiment(name=self.task.__name__) as span: + span.context.set_baggage_item("is_experiment_task", True) input_data = row["input"] expected_output = row["expected_output"] @@ -684,7 +652,7 @@ def process_row(idx_row): output = self.task(input_data) # Periodic flush for concurrency - if idx % 10 == 0: + if idx % 30 == 0: LLMObs.flush() LLMObs.annotate( @@ -699,6 +667,10 @@ def process_row(idx_row): ) LLMObs._tag_expected_output(span, expected_output) + span_context = LLMObs.export_span(span=span) + span_id = span_context["span_id"] + trace_id = span_context["trace_id"] + return { "idx": idx, "output": output, @@ -709,92 +681,56 @@ def process_row(idx_row): "project_name": self.project_name, "experiment_name": self.name, "dataset_name": self.dataset.name, - "span_id": span.span_id, - "trace_id": span.trace_id, + "span_id": span_id, + "trace_id": trace_id, }, "error": {"message": None, "stack": None, "type": None}, } - - except Exception as e: - error_message = str(e) - return { - "idx": idx, - "output": None, - "metadata": { - "timestamp": start_time, - "duration": time.time() - start_time, - "dataset_record_index": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - "span_id": span.span_id, - "trace_id": span.trace_id, - }, - "error": { - "message": error_message, - "stack": traceback.format_exc(), - "type": type(e).__name__, - } + except Exception as e: + error_message = str(e) + # In case of an exception, span_id and trace_id are set to None + return { + "idx": idx, + "output": None, + "metadata": { + "timestamp": start_time, + "duration": time.time() - start_time, + "dataset_record_index": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + "span_id": None, + "trace_id": None, + }, + "error": { + "message": error_message, + "stack": traceback.format_exc(), + "type": type(e).__name__, } + } - _print_progress_bar(0, total_rows, prefix='Processing:', suffix='Complete') + outputs_buffer = [] + completed = 0 + # Using ThreadPoolExecutor.map to process rows concurrently with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: - futures = { - executor.submit(process_row, (idx, row)): idx - for idx, row in enumerate(self.dataset) - } - outputs_buffer = [None] * total_rows - - try: - for future in concurrent.futures.as_completed(futures): - idx = futures[future] - try: - output_data = future.result() - outputs_buffer[idx] = 
output_data - if raise_errors and output_data["error"]["message"]: - error_message = output_data["error"]["message"] - raise ExperimentTaskError( - error_message, idx, output_data["error"]["type"] - ) - - elif output_data["error"]["message"]: - error_count += 1 - - except Exception as e: - outputs_buffer[idx] = { - "idx": idx, - "output": None, - "metadata": { - "timestamp": time.time(), - "duration": 0, - "dataset_record_index": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - "span_id": span.span_id, - "trace_id": span.trace_id, - }, - "error": { - "message": str(e), - "stack": traceback.format_exc(), - "type": type(e).__name__, - }, - } - if raise_errors: - raise e - else: - error_count += 1 - - completed += 1 - _print_progress_bar(completed, total_rows, prefix="Processing:", suffix="Complete") - - except Exception as e: - for future in futures: - future.cancel() - executor.shutdown(wait=False) - raise e + # executor.map returns results in order, so we iterate and update our progress + for result in executor.map(process_row, list(enumerate(self.dataset))): + outputs_buffer.append(result) + completed += 1 + _print_progress_bar(completed, total_rows, prefix="Processing:", suffix="Complete") + # Check for errors and raise if required + error_count = 0 + for idx, output_data in enumerate(outputs_buffer): + if output_data["error"]["message"]: + error_count += 1 + if raise_errors: + raise ExperimentTaskError( + output_data["error"]["message"], + idx, + output_data["error"]["type"] + ) self.outputs = outputs_buffer self.has_run = True @@ -1110,6 +1046,7 @@ def push(self, chunk_size: int = 300) -> None: metric_value = str(metric_payload_value["value"]) + metric = { "span_id": str(span_id), "trace_id": str(trace_id), @@ -1118,28 +1055,25 @@ def push(self, chunk_size: int = 300) -> None: "label": metric_payload_name, "score_value" if metric_type == "score" else "categorical_value": metric_value, "error": metric_payload_value["error"], - "join_on": { - "span": { - "trace_id": str(trace_id), - "span_id": str(span_id), - }, - } + # "join_on": { + # "span": { + # "trace_id": str(trace_id), + # "span_id": str(span_id), + # }, + # } } metrics.append(metric) - # Prepare and send chunk payload chunk_payload = { "data": { - "type": "evaluation_metric", + "type": "experiments", "attributes": {"scope": "experiments", "metrics": metrics, "tags": self.experiment.tags + ["ddtrace.version:" + ddtrace.__version__, "experiment_id:" + experiment_id]}, } } - print("chunk_payload: ", chunk_payload) - - url = f"/api/intake/llm-obs/v2/eval-metric" + url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" exp_http_request("POST", url, body=json.dumps(chunk_payload).encode("utf-8")) if show_progress: diff --git a/ddtrace/llmobs/_integrations/base.py b/ddtrace/llmobs/_integrations/base.py index a6968ce0d83..25081cd0f0a 100644 --- a/ddtrace/llmobs/_integrations/base.py +++ b/ddtrace/llmobs/_integrations/base.py @@ -210,7 +210,8 @@ def llmobs_set_tags( return try: self._llmobs_set_tags(span, args, kwargs, response, operation) - except Exception: + except Exception as e: + print(e) log.error("Error extracting LLMObs fields for span %s, likely due to malformed data", span, exc_info=True) @abc.abstractmethod diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index ef5d96ffa88..5dff1ffaa43 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -53,6 +53,8 @@ from ddtrace.llmobs._constants import 
SPAN_START_WHILE_DISABLED_WARNING from ddtrace.llmobs._constants import TAGS from ddtrace.llmobs._constants import EXPECTED_OUTPUT +from ddtrace.llmobs._constants import EXPERIMENT_INPUT +from ddtrace.llmobs._constants import EXPERIMENT_OUTPUT from ddtrace.llmobs._evaluators.runner import EvaluatorRunner from ddtrace.llmobs._utils import AnnotationContext from ddtrace.llmobs._utils import _get_llmobs_parent_id @@ -194,8 +196,13 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: span._set_ctx_item(ML_APP, ml_app) parent_id = str(_get_llmobs_parent_id(span) or "undefined") + # Experiments related if span._get_ctx_item(EXPECTED_OUTPUT) is not None: meta["expected_output"] = span._get_ctx_item(EXPECTED_OUTPUT) + if span._get_ctx_item(EXPERIMENT_INPUT) is not None: + meta["input"] = span._get_ctx_item(EXPERIMENT_INPUT) + if span._get_ctx_item(EXPERIMENT_OUTPUT) is not None: + meta["output"] = span._get_ctx_item(EXPERIMENT_OUTPUT) llmobs_span_event = { "trace_id": "{:x}".format(span.trace_id), @@ -805,6 +812,8 @@ def annotate( cls._tag_embedding_io(span, input_documents=input_data, output_text=output_data) elif span_kind == "retrieval": cls._tag_retrieval_io(span, input_text=input_data, output_documents=output_data) + elif span_kind == "experiment": + cls._tag_experiment_io(span, input_data=input_data, output_data=output_data) else: cls._tag_text_io(span, input_value=input_data, output_value=output_data) @@ -906,6 +915,16 @@ def _tag_text_io(cls, span, input_value=None, output_value=None): if output_value is not None: span._set_ctx_item(OUTPUT_VALUE, str(output_value)) + @classmethod + def _tag_experiment_io(cls, span, input_data=None, output_data=None): + """Tags input/output values for experiment kind spans. + Will be mapped to span's `meta.{input,output}.values` fields. + """ + if input_data is not None: + span._set_ctx_item(EXPERIMENT_INPUT, str(input_data)) + if output_data is not None: + span._set_ctx_item(EXPERIMENT_OUTPUT, str(output_data)) + @staticmethod def _tag_span_tags(span: Span, span_tags: Dict[str, Any]) -> None: """Tags a given LLMObs span with a dictionary of key-value tag pairs. 
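
For reference, a minimal standalone sketch of the mapping this _llmobs.py change introduces: values stored under the new EXPERIMENT_INPUT and EXPERIMENT_OUTPUT context items are surfaced as the span event's meta["input"] and meta["output"]. This is an illustration only, not ddtrace code; the plain dict below stands in for the span's context items.

    EXPERIMENT_INPUT = "_ml_obs.meta.input"
    EXPERIMENT_OUTPUT = "_ml_obs.meta.output"

    def build_experiment_meta(ctx_items: dict) -> dict:
        # Mirrors the branch added above: only keys that were annotated are set.
        meta = {}
        if ctx_items.get(EXPERIMENT_INPUT) is not None:
            meta["input"] = ctx_items[EXPERIMENT_INPUT]
        if ctx_items.get(EXPERIMENT_OUTPUT) is not None:
            meta["output"] = ctx_items[EXPERIMENT_OUTPUT]
        return meta

    assert build_experiment_meta(
        {EXPERIMENT_INPUT: {"question": "2 + 2?"}, EXPERIMENT_OUTPUT: "4"}
    ) == {"input": {"question": "2 + 2?"}, "output": "4"}
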
diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index 7f6be1d6fd4..39dad763389 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -190,101 +190,38 @@ def __len__(self): def _init_buffer(self): with self._lock: self._buffer = [] - self._experiment_buffer = [] self.buffer_size = 0 def put(self, events: List[LLMObsSpanEvent]): - # Split incoming events into normal vs experiment spans - norm_events = [] - exp_events = [] - for e in events: - if e.get("meta", {}).get("span.kind") == "experiment": - exp_events.append(e) - else: - norm_events.append(e) - - # Add normal spans to main buffer - if norm_events: - with self._lock: - if len(self._buffer) + len(norm_events) > self._buffer_limit: - logger.warning("Dropping normal spans: buffer limit reached") - return - self._buffer.extend(norm_events) - self.buffer_size += len(safe_json(norm_events)) - - # Add experiment spans to separate buffer - if exp_events: - with self._lock: - if len(self._experiment_buffer) + len(exp_events) > self._buffer_limit: - logger.warning("Dropping experiment spans: buffer limit reached") - return - self._experiment_buffer.extend(exp_events) - self.buffer_size += len(safe_json(exp_events)) + # events always has only 1 event - with List type to be compatible with HTTPWriter interfaces + with self._lock: + if len(self._buffer) >= self._buffer_limit: + logger.warning( + "%r event buffer full (limit is %d), dropping event", self.__class__.__name__, self._buffer_limit + ) + return + self._buffer.extend(events) + self.buffer_size += len(safe_json(events)) def encode(self): - """Encode only the normal spans for standard flush""" with self._lock: if not self._buffer: return None, 0 events = self._buffer - - # Save experiment buffer before _init_buffer() clears it - experiment_spans = self._experiment_buffer self._init_buffer() - - data = { - "_dd.stage": "raw", - "_dd.tracer_version": ddtrace.__version__, - "event_type": "span", - "spans": events - } - - if asbool(os.getenv("DD_EXPERIMENTS_RUNNER_ENABLED")): - data["_dd.scope"] = "experiments" - - try: - enc_data = safe_json(data) - if isinstance(enc_data, str): - enc_data = enc_data.encode('utf-8') - logger.debug("encode %d LLMObs span events", len(events)) - except TypeError: - logger.error("failed to encode LLMObs span events", exc_info=True) - return None, 0 - - # Restore experiment buffer - with self._lock: - self._experiment_buffer = experiment_spans - - return enc_data, len(events) - - def encode_experiment_spans(self): - """Encode only the experiment spans for separate request""" - with self._lock: - if not self._experiment_buffer: - return None, 0 - exp_events = self._experiment_buffer - self._experiment_buffer = [] - - data = { - "_dd.stage": "raw", - "_dd.tracer_version": ddtrace.__version__, - "event_type": "experiment-span", - "experiment_spans": exp_events - } - + data = {"_dd.stage": "raw", "_dd.tracer_version": ddtrace.__version__, "event_type": "span", "spans": events} if asbool(os.getenv("DD_EXPERIMENTS_RUNNER_ENABLED")): data["_dd.scope"] = "experiments" - try: - enc_data = safe_json(data) - if isinstance(enc_data, str): - enc_data = enc_data.encode('utf-8') - logger.debug("encode %d LLMObs experiment span events", len(exp_events)) + enc_llm_events = safe_json(data) + if isinstance(enc_llm_events, str): + enc_llm_events = enc_llm_events.encode('utf-8') + logger.debug("encode %d LLMObs span events to be sent", len(events)) + except TypeError: - logger.error("failed to encode LLMObs experiment span events", exc_info=True) 
+ logger.error("failed to encode %d LLMObs span events", len(events), exc_info=True) return None, 0 - - return enc_data, len(exp_events) + return enc_llm_events, len(events) class LLMObsEventClient(WriterClientBase): @@ -378,33 +315,6 @@ def recreate(self): is_agentless=config._llmobs_agentless_enabled, ) - def periodic(self) -> None: - # First flush normal spans using parent logic - super(LLMObsSpanWriter, self).periodic() - - # Then flush experiment spans in a separate request - for client in self._clients: - if isinstance(client, LLMObsEventClient) and isinstance(client.encoder, LLMObsSpanEncoder): - encoded, count = client.encoder.encode_experiment_spans() - if not encoded or not count: - continue - - try: - self._send_payload_with_backoff(encoded, count, client) - except Exception: - self._metrics_dist("http.errors", tags=["type:err"]) - self._metrics_dist("http.dropped.bytes", len(encoded)) - self._metrics_dist("http.dropped.traces", count) - logger.error( - "failed to send %d experiment spans to %s", - count, - self.intake_url, - exc_info=True - ) - else: - self._metrics_dist("http.sent.bytes", len(encoded)) - self._metrics_dist("http.sent.traces", count) - def _truncate_span_event(event: LLMObsSpanEvent) -> LLMObsSpanEvent: event["meta"]["input"] = {"value": DROPPED_VALUE_TEXT} From 9ecc7889853b5456788be1d807ff4cde88944f0b Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 11 Feb 2025 17:31:08 -0500 Subject: [PATCH 36/36] stabilize errors --- ddtrace/llmobs/_experiments.py | 164 +++++++++++++++++++-------------- ddtrace/llmobs/_llmobs.py | 5 +- ddtrace/llmobs/_writer.py | 1 + 3 files changed, 97 insertions(+), 73 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 335c0ff4e30..7b6d6618db3 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -23,7 +23,7 @@ import ddtrace from ddtrace import patch_all -patch_all() +# patch_all() # TODO: remove this comment if it messes with dist tracing, right now it's needed because it overrides integrations_enabled DD_SITE = os.getenv("DD_SITE", "datadoghq.com") if DD_SITE == "datadoghq.com": @@ -31,9 +31,6 @@ else: BASE_URL = f"https://{DD_SITE}" -class FileType(Enum): - CSV = 'csv' - LLMObs.enable( ml_app="experiment-jonathan", integrations_enabled=True, @@ -42,6 +39,43 @@ class FileType(Enum): api_key=os.getenv("DD_API_KEY"), ) +IS_INITIALIZED = False +ENV_ML_APP = None +ENV_PROJECT_NAME = None +ENV_SITE = None +ENV_API_KEY = None +ENV_APPLICATION_KEY = None + +def init(project_name: str, api_key: str = None, application_key: str = None, ml_app: str = "experiments", site: str = "datadoghq.com") -> None: + """Initialize an experiment environment. 
+ + Args: + project_name: Name of the project + api_key: Datadog API key + application_key: Datadog application key + ml_app: Name of the ML app + site: Datadog site + """ + + global IS_INITIALIZED + if IS_INITIALIZED: + raise ValueError("Experiment environment already initialized, please call init() only once") + else: + if api_key is None: + api_key = os.getenv("DD_API_KEY") + if api_key is None: + raise ValueError("DD_API_KEY environment variable is not set, please set it or pass it as an argument to init(api_key=...)") + if application_key is None: + application_key = os.getenv("DD_APPLICATION_KEY") + if application_key is None: + raise ValueError("DD_APPLICATION_KEY environment variable is not set, please set it or pass it as an argument to init(application_key=...)") + + ENV_ML_APP = ml_app + ENV_PROJECT_NAME = project_name + ENV_SITE = site + ENV_API_KEY = api_key + ENV_APPLICATION_KEY = application_key + IS_INITIALIZED = True class Dataset: @@ -331,38 +365,6 @@ def from_csv( return cls(name=name, data=data, description=description) - @classmethod - def load(cls, path: str, filetype: FileType, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None, delimiter: str = ",") -> "Dataset": - """Import a dataset from a file. - - Args: - path (str): Path to the input file - filetype (FileType): Type of file to import (CSV, JSONL, or PARQUET) - name (str): Name of the dataset - description (str, optional): Description of the dataset. Defaults to "". - input_columns (List[str], optional): List of column names to use as input data. Required for CSV and PARQUET files. - expected_output_columns (List[str], optional): List of column names to use as expected output data. Required for CSV and PARQUET files. - metadata_columns (List[str], optional): List of column names to include as metadata. Defaults to None. - delimiter (str, optional): Delimiter character for CSV files. Defaults to ",". - - Returns: - Dataset: A new Dataset instance containing the imported data - - Raises: - ValueError: If filetype is not supported or if required columns are missing - """ - if filetype == FileType.CSV: - return cls.from_csv( - filepath=path, - name=name, - description=description, - delimiter=delimiter, - input_columns=input_columns, - expected_output_columns=expected_output_columns, - metadata_columns=metadata_columns, - ) - - raise ValueError(f"Unsupported file type: {filetype}") def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": """Convert the dataset to a pandas DataFrame. @@ -602,7 +604,7 @@ def _create_experiment_in_datadog(self) -> str: def run( self, jobs: int = 10, - raise_errors: bool = False, + raise_errors: bool = True, ) -> "ExperimentResults": """ Execute the task and evaluations, returning the results. @@ -629,7 +631,7 @@ def run( experiment_results = self.run_evaluations(raise_errors=raise_errors) return experiment_results - def run_task(self, _jobs: int = 50, raise_errors: bool = False) -> None: + def run_task(self, _jobs: int = 10, raise_errors: bool = True) -> None: """ Execute the task function on the dataset concurrently using ThreadPoolExecutor.map, updating progress via _print_progress_bar and processing more rows in parallel. 
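
As a point of reference for the hunks that follow, this is a minimal sketch of the ThreadPoolExecutor.map pattern the docstring above describes: map yields results in input order, so a running counter is enough to drive the progress display. The names here (process_row, rows) are illustrative stand-ins, not the SDK's API.

    import concurrent.futures
    import time

    def process_row(idx_row):
        idx, row = idx_row
        time.sleep(0.01)  # stand-in for the user-provided task
        return {"idx": idx, "output": row["input"]["question"].upper()}

    rows = [{"input": {"question": f"q{i}"}} for i in range(5)]
    outputs_buffer = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Results arrive in input order, so index alignment with the dataset is preserved.
        for completed, result in enumerate(executor.map(process_row, enumerate(rows)), start=1):
            outputs_buffer.append(result)
            print(f"Processing: {completed}/{len(rows)}")
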
@@ -640,21 +642,22 @@ def run_task(self, _jobs: int = 50, raise_errors: bool = False) -> None: def process_row(idx_row): idx, row = idx_row start_time = time.time() - try: - with LLMObs._experiment(name=self.task.__name__) as span: - span.context.set_baggage_item("is_experiment_task", True) - input_data = row["input"] - expected_output = row["expected_output"] + with LLMObs._experiment(name=self.task.__name__) as span: + span.context.set_baggage_item("is_experiment_task", True) + span_context = LLMObs.export_span(span=span) + span_id = span_context["span_id"] + trace_id = span_context["trace_id"] + input_data = row["input"] + expected_output = row["expected_output"] + + try: + if getattr(self.task, "_accepts_config", False): output = self.task(input_data, self.config) else: output = self.task(input_data) - # Periodic flush for concurrency - if idx % 30 == 0: - LLMObs.flush() - LLMObs.annotate( span, input_data=input_data, @@ -667,9 +670,9 @@ def process_row(idx_row): ) LLMObs._tag_expected_output(span, expected_output) - span_context = LLMObs.export_span(span=span) - span_id = span_context["span_id"] - trace_id = span_context["trace_id"] + # Periodic flush for concurrency + if idx % 30 == 0: + LLMObs.flush() return { "idx": idx, @@ -686,31 +689,50 @@ def process_row(idx_row): }, "error": {"message": None, "stack": None, "type": None}, } - except Exception as e: - error_message = str(e) - # In case of an exception, span_id and trace_id are set to None - return { - "idx": idx, - "output": None, - "metadata": { - "timestamp": start_time, - "duration": time.time() - start_time, - "dataset_record_index": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - "span_id": None, - "trace_id": None, - }, - "error": { - "message": error_message, - "stack": traceback.format_exc(), - "type": type(e).__name__, + except Exception as e: + error_message = str(e) + span.error = 1 + span.set_exc_info(type(e), e, e.__traceback__) + + LLMObs.annotate( + span, + input_data=input_data, + tags={ + "dataset_id": self.dataset._datadog_dataset_id, + "dataset_record_id": row["record_id"], + "experiment_id": self._datadog_experiment_id, + }, + ) + LLMObs._tag_expected_output(span, expected_output) + + # Periodic flush for concurrency + if idx % 30 == 0: + LLMObs.flush() + + return { + "idx": idx, + "output": None, + "metadata": { + "timestamp": start_time, + "duration": time.time() - start_time, + "dataset_record_index": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + "span_id": span_id, + "trace_id": trace_id, + }, + "error": { + "message": error_message, + "stack": traceback.format_exc(), + "type": type(e).__name__, + } } - } outputs_buffer = [] completed = 0 + error_count = 0 + # Using ThreadPoolExecutor.map to process rows concurrently with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 5dff1ffaa43..d8ccefe984a 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -223,6 +223,7 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: llmobs_span_event["tags"] = cls._llmobs_tags( span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span ) + return llmobs_span_event, is_ragas_integration_span @staticmethod @@ -921,9 +922,9 @@ def _tag_experiment_io(cls, span, input_data=None, output_data=None): Will be mapped to span's 
`meta.{input,output}.values` fields. """ if input_data is not None: - span._set_ctx_item(EXPERIMENT_INPUT, str(input_data)) + span._set_ctx_item(EXPERIMENT_INPUT, input_data) if output_data is not None: - span._set_ctx_item(EXPERIMENT_OUTPUT, str(output_data)) + span._set_ctx_item(EXPERIMENT_OUTPUT, output_data) @staticmethod def _tag_span_tags(span: Span, span_tags: Dict[str, Any]) -> None: diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index 39dad763389..8bbf15e5bec 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -221,6 +221,7 @@ def encode(self): except TypeError: logger.error("failed to encode %d LLMObs span events", len(events), exc_info=True) return None, 0 + # print(enc_llm_events) return enc_llm_events, len(events)
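
For illustration, a minimal sketch of the chunk payload that the final version of push() assembles, built only from field names visible in the diffs above. The concrete identifiers and values are placeholders; in push() they come from the merged result metadata and the already-created experiment, and the payload is posted with the document's exp_http_request helper (left commented so the sketch runs standalone).

    import json
    import time

    # Placeholder identifiers for the sketch.
    span_id, trace_id, experiment_id = "1234567890", "9876543210", "exp-id-placeholder"
    metric_type = "score"

    metric = {
        "span_id": str(span_id),
        "trace_id": str(trace_id),
        "metric_type": metric_type,
        "timestamp_ms": int(time.time() * 1000),
        "label": "accuracy",
        # push() stringifies the value and picks the key based on the metric type.
        "score_value" if metric_type == "score" else "categorical_value": "0.87",
        "error": None,
    }

    chunk_payload = {
        "data": {
            "type": "experiments",
            "attributes": {
                "scope": "experiments",
                "metrics": [metric],
                "tags": ["ddtrace.version:placeholder", "experiment_id:" + experiment_id],
            },
        }
    }

    url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events"
    body = json.dumps(chunk_payload).encode("utf-8")
    # exp_http_request("POST", url, body=body)
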