From b4d6082ccd0d52e9a59e5ae4efd3f47521127e44 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Thu, 24 Oct 2024 17:46:14 -0400 Subject: [PATCH 01/36] Add main classes for experiments sdk --- ddtrace/llmobs/experiments.py | 238 ++++++++++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) create mode 100644 ddtrace/llmobs/experiments.py diff --git a/ddtrace/llmobs/experiments.py b/ddtrace/llmobs/experiments.py new file mode 100644 index 00000000000..c51acb39ae5 --- /dev/null +++ b/ddtrace/llmobs/experiments.py @@ -0,0 +1,238 @@ +from ddtrace import config +from typing import List, Dict, Any, Callable, Union +import time +import sys + +class Dataset: + def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") -> None: + self.name = name + self.data = data + self.description = description + self._validate_data() + + def __iter__(self) -> iter: + return iter(self.data) + + def __len__(self) -> int: + return len(self.data) + + def __getitem__(self, index: int) -> Dict[str, Any]: + return self.data[index] + + def __repr__(self) -> str: + header = f"Dataset: {self.name}\nDescription: {self.description}\nLength: {len(self)}\n" + separator = "+" + "-"*10 + "+" + "-"*38 + "+" + "-"*38 + "+" + + def format_dict(d: Dict[str, Any]) -> List[str]: + def truncate(value: str) -> str: + return (value[:17] + '...') if len(value) > 20 else value + + return [f"{key}: {truncate(str(value))}" for key, value in d.items()] + + def format_entries(entries): + formatted_rows = [] + for i, entry in entries: + input_lines = format_dict(entry['input']) + expected_output_lines = format_dict(entry.get('expected_output', {})) + + # Determine the maximum number of lines in input and expected_output + max_lines = max(len(input_lines), len(expected_output_lines)) + + # Pad the lists to have the same number of lines + input_lines += [''] * (max_lines - len(input_lines)) + expected_output_lines += [''] * (max_lines - len(expected_output_lines)) + + for j in range(max_lines): + if j == 0: + index = f"| {i+1:<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} |" + else: + index = f"| {'':<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} |" + formatted_rows.append(index) + formatted_rows.append(separator) + return "\n".join(formatted_rows) + + if len(self.data) <= 4: + entries = format_entries(enumerate(self.data)) + else: + first_two = format_entries(enumerate(self.data[:2])) + last_two = format_entries(enumerate(self.data[-2:], start=len(self.data) - 2)) + entries = f"{first_two}\n| {'...':<8} | {'...':<38} | {'...':<38} |\n{separator}\n{last_two}" + + table = f"{separator}\n| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} |\n{separator}\n{entries}" + return f"{header}\n{table if entries else 'No entries available.'}\n\n" + + def _validate_data(self) -> None: + if not self.data: + raise ValueError("Data cannot be empty.") + + if not all(isinstance(row, dict) for row in self.data): + raise ValueError("All rows must be dictionaries.") + + first_row_keys = set(self.data[0].keys()) + for row in self.data: + if set(row.keys()) != first_row_keys: + raise ValueError("All rows must have the same keys.") + + # Check that 'input' and 'expected_output' are flat dictionaries + for key in ['input', 'expected_output']: + if key in row and any(isinstance(value, dict) for value in row[key].values()): + raise ValueError(f"'{key}' must be a flat dictionary (no nested dictionaries).") + + @classmethod + def from_datadog(cls, name: str) -> 'Dataset': + # TODO: Implement this + pass + + def 
push(self) -> None: + # TODO: Implement this + print(config._dd_api_key) + pass + + +class Experiment: + def __init__(self, name: str, task: Callable, dataset: Dataset, evaluators: List[Callable]) -> None: + self.name = name + self.task = task + self.dataset = dataset + self.evaluators = evaluators + + def __repr__(self) -> str: + separator = "+" + "-"*20 + "+" + "-"*50 + "+" + + def format_evaluator(evaluator: Callable) -> str: + return f"{evaluator.__name__}" + + evaluator_lines = [format_evaluator(evaluator) for evaluator in self.evaluators] + evaluators = ", ".join(evaluator_lines) if evaluator_lines else "No evaluators available" + + table = ( + f"{separator}\n" + f"| {'Experiment':<18} | {self.name:<48} |\n" + f"{separator}\n" + f"| {'Task':<18} | {self.task.__name__:<48} |\n" + f"| {'Dataset':<18} | {f'{self.dataset.name} (n={len(self.dataset)})':<48} |\n" + f"| {'Evaluators':<18} | {evaluators:<48} |\n" + f"{separator}" + ) + return table + + def _validate_tasks(self) -> None: + # TODO: Implement this + pass + + def _validate_evaluators(self) -> None: + # TODO: Implement this + pass + + def run(self) -> 'ExperimentResults': + results = ExperimentResults(self.dataset) + total_rows = len(self.dataset) + + for idx, row in enumerate(self.dataset, 0): + # Apply the task function to the row + start_time = time.time() + output = self.task(row) + end_time = time.time() + duration = end_time - start_time + + # Store the results + results.experiment_rows.append({ + "output": output, + "evaluations": [], + "metadata": { + "duration": duration, + "timestamp": start_time + } + }) + + def _evaluate_row(row: Dict[str, Any], output: Dict[str, Any]) -> Dict[str, Any]: + return {evaluator.__name__: evaluator(row, output) for evaluator in self.evaluators} + + results.experiment_rows[idx]["evaluations"] = _evaluate_row(row, output) + + # Update progress + progress = int(50 * idx / total_rows) # Progress bar length of 50 + bar = '=' * progress + ' ' * (50 - progress) + percent = int(100 * idx / total_rows) + sys.stdout.write(f'\rRunning {self.name}: [{bar}] {percent}% ({idx}/{total_rows})') + sys.stdout.flush() + + # Print a new line after completion + sys.stdout.write('\n') + + return results + + +class ExperimentResults: + def __init__(self, dataset: Dataset) -> None: + self.dataset = dataset + self.experiment_rows = [] + + def __repr__(self) -> str: + separator = "+" + "-"*10 + "+" + "-"*38 + "+" + "-"*38 + "+" + "-"*38 + "+" + "-"*38 + "+" + + def format_dict(d: Union[Dict[str, Any], List[Any]]) -> List[str]: + if isinstance(d, dict): + def truncate(value: str) -> str: + return (value[:17] + '...') if len(value) > 20 else value + + return [f"{key}: {truncate(str(value))}" for key, value in d.items()] + elif isinstance(d, list): + return [str(item) for item in d] + else: + return [str(d)] + + def format_entries(entries): + formatted_rows = [] + for i, entry in enumerate(entries): + dataset_entry = self.dataset[i] + input_lines = format_dict(dataset_entry['input']) + expected_output_lines = format_dict(dataset_entry.get('expected_output', {})) + output_lines = format_dict(entry['output']) + evaluations_lines = format_dict(entry.get('evaluations', [])) + + # Determine the maximum number of lines across all fields + max_lines = max(len(input_lines), len(expected_output_lines), len(output_lines), len(evaluations_lines)) + + # Pad the lists to have the same number of lines + input_lines += [''] * (max_lines - len(input_lines)) + expected_output_lines += [''] * (max_lines - len(expected_output_lines)) + 
output_lines += [''] * (max_lines - len(output_lines)) + evaluations_lines += [''] * (max_lines - len(evaluations_lines)) + + for j in range(max_lines): + if j == 0: + index = f"| {i+1:<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" + else: + index = f"| {'':<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" + formatted_rows.append(index) + formatted_rows.append(separator) + return "\n".join(formatted_rows) + + if len(self.experiment_rows) <= 4: + entries = format_entries(self.experiment_rows) + else: + first_two = format_entries(self.experiment_rows[:2]) + last_two = format_entries(self.experiment_rows[-2:]) + entries = f"{first_two}\n| {'...':<8} | {'...':<38} | {'...':<38} | {'...':<38} | {'...':<38} |\n{separator}\n{last_two}" + + table = ( + f"{separator}\n" + f"| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} | {'Output':<38} | {'Evaluations':<38} |\n" + f"{separator}\n" + f"{entries}" + ) + return f"Experiment Results:\n{table if entries else 'No results available.'}\n\n" + + def __iter__(self) -> iter: + return iter(self.experiment_rows) + + def __len__(self) -> int: + return len(self.experiment_rows) + + def __getitem__(self, index: int) -> Any: + return self.experiment_rows[index] + + def push(self) -> None: + # TODO: Implement this + pass From f9e929629cb9c0e401112f70c1139bd6f1ec6827 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Thu, 24 Oct 2024 18:14:29 -0400 Subject: [PATCH 02/36] Added more things but don't remember what --- ddtrace/llmobs/experiments.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ddtrace/llmobs/experiments.py b/ddtrace/llmobs/experiments.py index c51acb39ae5..798854ed562 100644 --- a/ddtrace/llmobs/experiments.py +++ b/ddtrace/llmobs/experiments.py @@ -117,11 +117,11 @@ def format_evaluator(evaluator: Callable) -> str: return table def _validate_tasks(self) -> None: - # TODO: Implement this + # TODO: Design and implement this pass def _validate_evaluators(self) -> None: - # TODO: Implement this + # TODO: Design and implement this pass def run(self) -> 'ExperimentResults': From 60f3ba5ad488e1357fbf0e5278d29b84514bf2d5 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Mon, 28 Oct 2024 17:28:57 -0400 Subject: [PATCH 03/36] Add network calls for main methods --- ddtrace/llmobs/experiments.py | 456 ++++++++++++++++++++++++++++++++-- 1 file changed, 434 insertions(+), 22 deletions(-) diff --git a/ddtrace/llmobs/experiments.py b/ddtrace/llmobs/experiments.py index 798854ed562..8d43d47415c 100644 --- a/ddtrace/llmobs/experiments.py +++ b/ddtrace/llmobs/experiments.py @@ -1,14 +1,28 @@ -from ddtrace import config -from typing import List, Dict, Any, Callable, Union -import time +from datetime import datetime +from http.client import HTTPSConnection +import hashlib +import json +import os +from typing import Any, Callable, Dict, List, Union import sys +import time +from urllib.parse import quote + +# Constants +BASE_URL = "api.datadoghq.com" +PROJECT_NAME = "sdk-testing" + class Dataset: def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") -> None: self.name = name + self._validate_data(data) self.data = data self.description = description - self._validate_data() + + # Post-push attributes + self.datadog_dataset_id = None + def __iter__(self) -> iter: return iter(self.data) @@ -18,9 +32,10 @@ def __len__(self) -> int: def __getitem__(self, index: int) -> Dict[str, 
Any]: return self.data[index] + def __repr__(self) -> str: - header = f"Dataset: {self.name}\nDescription: {self.description}\nLength: {len(self)}\n" + header = f"Dataset: {self.name}\nDescription: {self.description}\nLength: {len(self)}\nDatadog ID: {self.datadog_dataset_id}\n" separator = "+" + "-"*10 + "+" + "-"*38 + "+" + "-"*38 + "+" def format_dict(d: Dict[str, Any]) -> List[str]: @@ -29,7 +44,7 @@ def truncate(value: str) -> str: return [f"{key}: {truncate(str(value))}" for key, value in d.items()] - def format_entries(entries): + def format_entries(entries): # Fixed indentation - this was nested too deeply formatted_rows = [] for i, entry in entries: input_lines = format_dict(entry['input']) @@ -61,15 +76,15 @@ def format_entries(entries): table = f"{separator}\n| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} |\n{separator}\n{entries}" return f"{header}\n{table if entries else 'No entries available.'}\n\n" - def _validate_data(self) -> None: - if not self.data: + def _validate_data(self, data: List[Dict[str, Any]]) -> None: + if not data: raise ValueError("Data cannot be empty.") - if not all(isinstance(row, dict) for row in self.data): + if not all(isinstance(row, dict) for row in data): raise ValueError("All rows must be dictionaries.") - first_row_keys = set(self.data[0].keys()) - for row in self.data: + first_row_keys = set(data[0].keys()) + for row in data: if set(row.keys()) != first_row_keys: raise ValueError("All rows must have the same keys.") @@ -80,13 +95,156 @@ def _validate_data(self) -> None: @classmethod def from_datadog(cls, name: str) -> 'Dataset': - # TODO: Implement this - pass + """Create a dataset from a dataset hosted in Datadog. + + Args: + name: Name of the dataset to retrieve from Datadog + + Returns: + Dataset: A new Dataset instance populated with the records from Datadog + + Raises: + ValueError: If the dataset is not found + Exception: If there are HTTP errors during the request + """ + conn = HTTPSConnection(BASE_URL) + headers = { + "DD-API-KEY": os.getenv("DD_API_KEY"), + "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), + "Content-Type": "application/json" + } + + try: + # Get dataset ID + encoded_name = quote(name) + url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" + response_data = _make_request(conn, headers, "GET", url, context="Dataset lookup") + datasets = response_data.get('data', []) + + if not datasets: + raise ValueError(f"Dataset '{name}' not found") + + dataset_id = datasets[0]['id'] + + # Get dataset records + url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" + records_data = _make_request(conn, headers, "GET", url, context="Records lookup") + + # Transform records into the expected format + class_records = [] + for record in records_data.get('data', []): + attrs = record.get('attributes', {}) + class_records.append({ + "input": attrs.get('input', {}), + "expected_output": attrs.get('expected_output', {}), + **attrs.get('metadata', {}) + }) + + # Create new dataset instance + dataset = cls(name, class_records) + dataset.datadog_dataset_id = dataset_id + return dataset + + finally: + conn.close() + + def push(self) -> Dict[str, str]: + """Push the dataset to Datadog. 
+ + Returns: + Dict[str, str]: Dictionary containing dataset information including: + - dataset_id: The ID of the created/updated dataset + - dataset_name: The name of the dataset + - record_count: Number of records uploaded + """ + # Initialize connection and headers + conn = HTTPSConnection(BASE_URL) + headers = { + "DD-API-KEY": os.getenv("DD_API_KEY"), + "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), + "Content-Type": "application/json" + } + + try: + # Check if dataset exists + encoded_name = quote(self.name) + url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" + response_data = _make_request(conn, headers, "GET", url, context="Dataset lookup") + datasets = response_data.get('data', []) + + if not datasets: + # Create new dataset + print(f"Dataset '{self.name}' not found. Creating it.") + dataset_payload = { + "data": { + "type": "datasets", + "attributes": { + "name": self.name, + "description": self.description or f"Dataset used for {self.name}", + "metadata": {"team": "ml-obs"} + } + } + } + response_data = _make_request( + conn, + headers, + "POST", + "/api/unstable/llm-obs/v1/datasets", + body=json.dumps(dataset_payload), + context="Dataset creation" + ) + dataset_id = response_data['data']['id'] + self.datadog_dataset_id = dataset_id + else: + # Dataset exists, create a new version + dataset_id = datasets[0]['id'] + version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + new_dataset_name = f"{self.name}-{version_suffix}" + print(f"Dataset '{self.name}' found. Creating new version '{new_dataset_name}'.") + dataset_payload = { + "data": { + "type": "datasets", + "attributes": { + "name": new_dataset_name, + "description": f"Dataset versioned on {version_suffix} used for {self.name}", + "metadata": {"team": "ml-obs"} + } + } + } + response_data = _make_request( + conn, + headers, + "POST", + "/api/unstable/llm-obs/v1/datasets", + body=json.dumps(dataset_payload), + context="Dataset version creation" + ) + dataset_id = response_data['data']['id'] + self.datadog_dataset_id = dataset_id + self.name = new_dataset_name + + # Add records to the dataset + records_payload = { + "data": { + "type": "datasets", + "attributes": { + "records": self.data + } + } + } + url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" + _make_request(conn, headers, "POST", url, body=json.dumps(records_payload), context="Adding records") + + print(f"✓ Successfully uploaded dataset '{self.name}'") + print(f" • Dataset ID: {dataset_id}") + print(f" • Records uploaded: {len(self.data)}") + + return self + + finally: + conn.close() + - def push(self) -> None: - # TODO: Implement this - print(config._dd_api_key) - pass class Experiment: @@ -95,6 +253,12 @@ def __init__(self, name: str, task: Callable, dataset: Dataset, evaluators: List self.task = task self.dataset = dataset self.evaluators = evaluators + self.tags = [] + + # Post-run attributes + self.has_run = False + self.results = None + def __repr__(self) -> str: separator = "+" + "-"*20 + "+" + "-"*50 + "+" @@ -139,10 +303,17 @@ def run(self) -> 'ExperimentResults': results.experiment_rows.append({ "output": output, "evaluations": [], + "metadata": { + "timestamp": start_time, "duration": duration, - "timestamp": start_time - } + "dataset_record_idx": idx, + "project_name": PROJECT_NAME, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "tags": self.tags, + "error": None }) def _evaluate_row(row: Dict[str, Any], output: Dict[str, Any]) -> Dict[str, Any]: @@ -160,7 +331,217 @@ def 
_evaluate_row(row: Dict[str, Any], output: Dict[str, Any]) -> Dict[str, Any] # Print a new line after completion sys.stdout.write('\n') + self.has_run = True + self.results = results return results + + def get_results(self) -> 'ExperimentResults': + if not self.has_run: + raise ValueError("Experiment has not been run yet") + return self.results + + def push(self) -> Dict[str, str]: + """Push the experiment results to Datadog. + + Returns: + Dict[str, str]: Dictionary containing experiment information including: + - experiment_id: The ID of the created experiment + - experiment_name: The name of the experiment + - span_count: Number of spans uploaded + """ + if not self.has_run: + raise ValueError("Experiment has not been run yet") + + # Initialize connection and headers + conn = HTTPSConnection(BASE_URL) + headers = { + "DD-API-KEY": os.getenv("DD_API_KEY"), + "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), + "Content-Type": "application/json" + } + + try: + # Check if project exists + url = f"/api/unstable/llm-obs/v1/projects?filter[name]={PROJECT_NAME}" + response_data = _make_request(conn, headers, "GET", url, context="Project lookup") + projects = response_data.get('data', []) + + if not projects: + # Create new project + print(f"Project '{PROJECT_NAME}' not found. Creating it.") + project_payload = { + "data": { + "type": "projects", + "attributes": { + "name": PROJECT_NAME, + "description": f"Project for {PROJECT_NAME}", + "metadata": {"team": "ml-obs"} + } + } + } + response_data = _make_request( + conn, + headers, + "POST", + "/api/unstable/llm-obs/v1/projects", + body=json.dumps(project_payload), + context="Project creation" + ) + project_id = response_data['data']['id'] + else: + project_id = projects[0]['id'] + + # Check if experiment exists + encoded_name = quote(self.name) + url = f"/api/unstable/llm-obs/v1/experiments?filter[name]={encoded_name}" + response_data = _make_request(conn, headers, "GET", url, context="Experiment lookup") + experiments = response_data.get('data', []) + + if not experiments: + # Create new experiment + print(f"Experiment '{self.name}' not found. Creating it.") + experiment_payload = { + "data": { + "type": "experiments", + "attributes": { + "name": self.name, + "description": f"Experiment: {self.name} on dataset: {self.dataset.name}", + "dataset_id": self.dataset.datadog_dataset_id, + "project_id": project_id, + "metadata": { + "tags": self.tags, + "team": "ml-obs" + } + } + } + } + response_data = _make_request( + conn, + headers, + "POST", + "/api/unstable/llm-obs/v1/experiments", + body=json.dumps(experiment_payload), + context="Experiment creation" + ) + experiment_id = response_data['data']['id'] + else: + # Experiment exists, create a new version + version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + new_experiment_name = f"{self.name}-{version_suffix}" + print(f"Experiment '{self.name}' found. 
Creating new version '{new_experiment_name}'.") + experiment_payload = { + "data": { + "type": "experiments", + "attributes": { + "name": new_experiment_name, + "description": f"Experiment versioned on {version_suffix} used for {self.name}", + "dataset_id": self.dataset.datadog_dataset_id, + "project_id": project_id, + "metadata": { + "tags": self.tags, + "team": "ml-obs" + } + } + } + } + response_data = _make_request( + conn, + headers, + "POST", + "/api/unstable/llm-obs/v1/experiments", + body=json.dumps(experiment_payload), + context="Experiment version creation" + ) + experiment_id = response_data['data']['id'] + self.name = new_experiment_name + + # Prepare and send experiment results + spans = [] + metrics = [] + + + + for idx, result in enumerate(self.results): + + span = { + "span_id": _make_id(), + "project_id": project_id, + "experiment_id": experiment_id, + "dataset_id": self.dataset.datadog_dataset_id, + "dataset_record_id": _make_id(), + "start_ns": int(result['metadata']['timestamp'] * 1e9), + "duration": float(result['metadata']['duration'] * 1e9), + "tags": self.tags, + "status": "ok", + "meta": { + "span": {"kind": "experiment"}, + "input": self.dataset[idx]['input'], + "output": result['output'], + "expected_output": self.dataset[idx].get('expected_output', {}), + "error": { + "message": result['error'], + "stack": None, + "type": None + } + } + + } + spans.append(span) + + # Add evaluation metrics + for metric_name, metric_value in result['evaluations'].items(): + timestamp_ms = int(result['metadata']['timestamp'] * 1000) + + if isinstance(metric_value, bool): + metric_value = 1 if metric_value else 0 + metric_type = "score" + elif isinstance(metric_value, (int, float)): + metric_type = "score" + else: + metric_type = "categorical" + metric_value = str(metric_value) + + metric = { + "span_id": span['span_id'], + "metric_type": metric_type, + "timestamp_ms": timestamp_ms, + "label": metric_name, + "score_value" if metric_type == "score" else "categorical_value": metric_value + } + metrics.append(metric) + + results_payload = { + "data": { + "type": "experiments", + "attributes": { + "spans": spans, + "metrics": metrics + } + } + } + + + url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" + _make_request( + conn, + headers, + "POST", + url, + body=json.dumps(results_payload), + context="Publishing results" + ) + + print(f"✓ Successfully uploaded experiment '{self.name}'") + print(f" • Experiment ID: {experiment_id}") + print(f" • Spans uploaded: {len(spans)}") + print(f" • Metrics uploaded: {len(metrics)}") + + return self + + finally: + conn.close() + + class ExperimentResults: @@ -233,6 +614,37 @@ def __len__(self) -> int: def __getitem__(self, index: int) -> Any: return self.experiment_rows[index] - def push(self) -> None: - # TODO: Implement this - pass + + +def _make_request(conn: HTTPSConnection, headers: Dict[str, Any], method: str, url: str, body: Any = None, context: str = "") -> Dict[str, Any]: + if method == "GET": + conn.request(method, url, headers=headers) + else: + if body is not None and isinstance(body, str): + body = body.encode('utf-8') + conn.request(method, url, body=body, headers=headers) + + response = conn.getresponse() + response_body = response.read() + + if response.status >= 400: + error_message = f"HTTP {response.status} Error during {context}: {response.reason}\nResponse body: {response_body.decode('utf-8')}" + raise Exception(error_message) + + # Add handling for empty response + if not response_body: + return {} # Return 
empty dict for empty responses + + try: + return json.loads(response_body) + except json.JSONDecodeError: + error_message = f"Invalid JSON response during {context}. Status: {response.status}\nResponse body: {response_body.decode('utf-8')}" + raise Exception(error_message) + +def _make_id() -> str: + return hashlib.sha256(datetime.now().isoformat().encode('utf-8')).hexdigest() + + + + + From d48942d0c554c6a487a56fa851dc95ff01c3ad08 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 29 Oct 2024 10:41:33 -0400 Subject: [PATCH 04/36] Add docstring --- ddtrace/llmobs/experiments.py | 513 ++++++++++++++++++++++++---------- 1 file changed, 364 insertions(+), 149 deletions(-) diff --git a/ddtrace/llmobs/experiments.py b/ddtrace/llmobs/experiments.py index 8d43d47415c..5781b1e9572 100644 --- a/ddtrace/llmobs/experiments.py +++ b/ddtrace/llmobs/experiments.py @@ -3,17 +3,49 @@ import hashlib import json import os -from typing import Any, Callable, Dict, List, Union +from typing import Any, Callable, Dict, List, Union, Optional, Iterator import sys import time from urllib.parse import quote +import concurrent.futures +import itertools +import uuid # Constants BASE_URL = "api.datadoghq.com" -PROJECT_NAME = "sdk-testing" + + +def _validate_api_keys() -> None: + """Validate that required Datadog API keys are set in environment variables. + + Raises: + ValueError: If any required API keys are missing from environment variables + """ + missing_keys = [] + for key in ["DD_API_KEY", "DD_APPLICATION_KEY"]: + if not os.getenv(key): + missing_keys.append(key) + + if missing_keys: + raise ValueError( + f"Missing required Datadog API keys in environment variables: {', '.join(missing_keys)}. " + "Please set these environment variables before pushing to Datadog." + ) class Dataset: + """A container for LLM experiment data that can be pushed to and retrieved from Datadog. + + This class manages collections of input/output pairs used for LLM experiments, + with functionality to validate, push to Datadog, and retrieve from Datadog. + + Attributes: + name (str): Name of the dataset + data (List[Dict[str, Any]]): List of records containing input/output pairs + description (str): Optional description of the dataset + datadog_dataset_id (str): ID assigned by Datadog after pushing (None if not pushed) + """ + def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") -> None: self.name = name self._validate_data(data) @@ -24,7 +56,7 @@ def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") self.datadog_dataset_id = None - def __iter__(self) -> iter: + def __iter__(self) -> Iterator[Dict[str, Any]]: return iter(self.data) def __len__(self) -> int: @@ -77,9 +109,22 @@ def format_entries(entries): # Fixed indentation - this was nested too deeply return f"{header}\n{table if entries else 'No entries available.'}\n\n" def _validate_data(self, data: List[Dict[str, Any]]) -> None: + """Validate the format and structure of dataset records. 
+ + Args: + data: List of dataset records to validate + + Raises: + ValueError: If data is empty, contains non-dictionary rows, + has inconsistent keys, contains nested dictionaries, + or exceeds 50,000 rows + """ if not data: raise ValueError("Data cannot be empty.") + if len(data) > 50000: + raise ValueError("Dataset cannot exceed 50,000 rows.") + if not all(isinstance(row, dict) for row in data): raise ValueError("All rows must be dictionaries.") @@ -107,6 +152,7 @@ def from_datadog(cls, name: str) -> 'Dataset': ValueError: If the dataset is not found Exception: If there are HTTP errors during the request """ + _validate_api_keys() conn = HTTPSConnection(BASE_URL) headers = { "DD-API-KEY": os.getenv("DD_API_KEY"), @@ -157,7 +203,7 @@ def push(self) -> Dict[str, str]: - dataset_name: The name of the dataset - record_count: Number of records uploaded """ - # Initialize connection and headers + _validate_api_keys() conn = HTTPSConnection(BASE_URL) headers = { "DD-API-KEY": os.getenv("DD_API_KEY"), @@ -248,13 +294,29 @@ def push(self) -> Dict[str, str]: class Experiment: - def __init__(self, name: str, task: Callable, dataset: Dataset, evaluators: List[Callable]) -> None: + """Manages the execution and evaluation of LLM tasks on a dataset. + + This class handles running tasks against datasets, applying evaluators, + and collecting results for analysis. + + Attributes: + name (str): Name of the experiment + task (Callable): Function that processes each dataset record + dataset (Dataset): Dataset to run the experiment on + evaluators (List[Callable]): Functions that evaluate task outputs + tags (List[str]): Tags for organizing experiments + project_name (str): Name of the project this experiment belongs to + has_run (bool): Whether the experiment has been executed + results (ExperimentResults): Results after running the experiment + """ + + def __init__(self, name: str, task: Callable, dataset: Dataset, evaluators: List[Callable], tags: List[str] = [], project_name: str = "-") -> None: self.name = name self.task = task self.dataset = dataset self.evaluators = evaluators self.tags = [] - + self.project_name = project_name # Post-run attributes self.has_run = False self.results = None @@ -288,45 +350,112 @@ def _validate_evaluators(self) -> None: # TODO: Design and implement this pass - def run(self) -> 'ExperimentResults': - results = ExperimentResults(self.dataset) + def _validate_tags(self) -> None: + """Validate experiment tags format. + + Raises: + ValueError: If any tag doesn't follow the 'key:value' format + """ + for tag in self.tags: + if not isinstance(tag, str) or ":" not in tag: + raise ValueError(f"Invalid tag format: {tag}. Tags should be in the format 'key:value'.") + + def run(self, _jobs: int = 10) -> 'ExperimentResults': + """Execute the experiment on the dataset. + + Runs the task function on each dataset record in parallel and collects + results and evaluations. + + Args: + _jobs (int, optional): Number of parallel workers. Defaults to 10. + Must be between 1 and 20. 
+ + Returns: + ExperimentResults: Object containing the experiment results + + Raises: + ValueError: If _jobs is not between 1 and 20 + """ + if not 1 <= _jobs <= 20: + raise ValueError("Number of jobs must be between 1 and 20") + + results = ExperimentResults(self.dataset, self) total_rows = len(self.dataset) - for idx, row in enumerate(self.dataset, 0): - # Apply the task function to the row - start_time = time.time() - output = self.task(row) - end_time = time.time() - duration = end_time - start_time - - # Store the results - results.experiment_rows.append({ - "output": output, - "evaluations": [], - - "metadata": { - "timestamp": start_time, - "duration": duration, - "dataset_record_idx": idx, - "project_name": PROJECT_NAME, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "tags": self.tags, - "error": None - }) - - def _evaluate_row(row: Dict[str, Any], output: Dict[str, Any]) -> Dict[str, Any]: - return {evaluator.__name__: evaluator(row, output) for evaluator in self.evaluators} - - results.experiment_rows[idx]["evaluations"] = _evaluate_row(row, output) - - # Update progress - progress = int(50 * idx / total_rows) # Progress bar length of 50 - bar = '=' * progress + ' ' * (50 - progress) - percent = int(100 * idx / total_rows) - sys.stdout.write(f'\rRunning {self.name}: [{bar}] {percent}% ({idx}/{total_rows})') - sys.stdout.flush() + def process_row(idx_row): + idx, row = idx_row + try: + # Apply the task function to the row + start_time = time.time() + output = self.task(row) + end_time = time.time() + duration = end_time - start_time + + # Evaluate the output + evaluations = {evaluator.__name__: evaluator(row, output) for evaluator in self.evaluators} + + # Store the result + return { + "idx": idx, + "result": { + "output": output, + "evaluations": evaluations, + "metadata": { + "timestamp": start_time, + "duration": duration, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "tags": self.tags, + "error": None + } + } + except Exception as e: + # Handle exceptions and store the error + return { + "idx": idx, + "result": { + "output": None, + "evaluations": {}, + "metadata": { + "timestamp": time.time(), + "duration": 0, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "tags": self.tags, + "error": str(e) + } + } + + with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: + # Create futures list first + future_to_idx = { + executor.submit(process_row, (idx, row)): idx + for idx, row in enumerate(self.dataset) + } + + # Process as they complete while maintaining order + completed = 0 + results_buffer = [None] * total_rows + for future in concurrent.futures.as_completed(future_to_idx): + idx = future_to_idx[future] + results_buffer[idx] = future.result()['result'] + completed += 1 + + # Update progress + progress = int(50 * completed / total_rows) + bar = '=' * progress + ' ' * (50 - progress) + percent = int(100 * completed / total_rows) + sys.stdout.write(f'\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})') + sys.stdout.flush() + + # Add results in correct order + results.experiment_rows = results_buffer # Print a new line after completion sys.stdout.write('\n') @@ -334,12 +463,98 @@ def _evaluate_row(row: Dict[str, Any], output: Dict[str, Any]) -> Dict[str, Any] self.has_run = True self.results = results return results - - def 
get_results(self) -> 'ExperimentResults': + + def get_results(self) -> Union['ExperimentResults', List['ExperimentResults']]: if not self.has_run: raise ValueError("Experiment has not been run yet") return self.results + + + +class ExperimentResults: + """Contains and manages the results of an experiment run. + + Stores the outputs, evaluations, and metadata for each record processed + in an experiment, with functionality to analyze and push results to Datadog. + + Attributes: + dataset (Dataset): The dataset used in the experiment + experiment (Experiment): The experiment that generated these results + experiment_rows (List[Dict]): Results for each processed record + """ + + def __init__(self, dataset: Dataset, experiment: Experiment) -> None: + self.dataset = dataset + self.experiment = experiment + self.experiment_rows = [] + + def __repr__(self) -> str: + separator = "+" + "-"*10 + "+" + "-"*38 + "+" + "-"*38 + "+" + "-"*38 + "+" + "-"*38 + "+" + + def format_dict(d: Union[Dict[str, Any], List[Any]]) -> List[str]: + if isinstance(d, dict): + def truncate(value: str) -> str: + return (value[:17] + '...') if len(value) > 20 else value + + return [f"{key}: {truncate(str(value))}" for key, value in d.items()] + elif isinstance(d, list): + return [str(item) for item in d] + else: + return [str(d)] + + def format_entries(entries): + formatted_rows = [] + for i, entry in enumerate(entries): + dataset_idx = entry['metadata']['dataset_record_idx'] + dataset_entry = self.dataset[dataset_idx] + input_lines = format_dict(dataset_entry['input']) + expected_output_lines = format_dict(dataset_entry.get('expected_output', {})) + output_lines = format_dict(entry['output']) + evaluations_lines = format_dict(entry.get('evaluations', [])) + + # Determine the maximum number of lines across all fields + max_lines = max(len(input_lines), len(expected_output_lines), len(output_lines), len(evaluations_lines)) + + # Pad the lists to have the same number of lines + input_lines += [''] * (max_lines - len(input_lines)) + expected_output_lines += [''] * (max_lines - len(expected_output_lines)) + output_lines += [''] * (max_lines - len(output_lines)) + evaluations_lines += [''] * (max_lines - len(evaluations_lines)) + + for j in range(max_lines): + if j == 0: + index = f"| {dataset_idx:<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" + else: + index = f"|{'':<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" + formatted_rows.append(index) + formatted_rows.append(separator) + return "\n".join(formatted_rows) + + if len(self.experiment_rows) <= 4: + entries = format_entries(self.experiment_rows) + else: + first_two = format_entries(self.experiment_rows[:2]) + last_two = format_entries(self.experiment_rows[-2:]) + entries = f"{first_two}\n| {'...':<8} | {'...':<38} | {'...':<38} | {'...':<38} | {'...':<38} |\n{separator}\n{last_two}" + + table = ( + f"{separator}\n" + f"| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} | {'Output':<38} | {'Evaluations':<38} |\n" + f"{separator}\n" + f"{entries}" + ) + return f"Experiment Results:\n{table if entries else 'No results available.'}\n\n" + + def __iter__(self) -> Iterator[Dict[str, Any]]: + return iter(self.experiment_rows) + + def __len__(self) -> int: + return len(self.experiment_rows) + + def __getitem__(self, index: int) -> Any: + return self.experiment_rows[index] + def push(self) -> Dict[str, str]: """Push the experiment results to 
Datadog. @@ -349,9 +564,8 @@ def push(self) -> Dict[str, str]: - experiment_name: The name of the experiment - span_count: Number of spans uploaded """ - if not self.has_run: - raise ValueError("Experiment has not been run yet") - + _validate_api_keys() + # Initialize connection and headers conn = HTTPSConnection(BASE_URL) headers = { @@ -362,19 +576,19 @@ def push(self) -> Dict[str, str]: try: # Check if project exists - url = f"/api/unstable/llm-obs/v1/projects?filter[name]={PROJECT_NAME}" + url = f"/api/unstable/llm-obs/v1/projects?filter[name]={self.experiment.project_name}" response_data = _make_request(conn, headers, "GET", url, context="Project lookup") projects = response_data.get('data', []) if not projects: # Create new project - print(f"Project '{PROJECT_NAME}' not found. Creating it.") + print(f"Project '{self.experiment.project_name}' not found. Creating it.") project_payload = { "data": { "type": "projects", "attributes": { - "name": PROJECT_NAME, - "description": f"Project for {PROJECT_NAME}", + "name": self.experiment.project_name, + "description": f"Project for {self.experiment.project_name}", "metadata": {"team": "ml-obs"} } } @@ -392,24 +606,24 @@ def push(self) -> Dict[str, str]: project_id = projects[0]['id'] # Check if experiment exists - encoded_name = quote(self.name) + encoded_name = quote(self.experiment.name) url = f"/api/unstable/llm-obs/v1/experiments?filter[name]={encoded_name}" response_data = _make_request(conn, headers, "GET", url, context="Experiment lookup") experiments = response_data.get('data', []) if not experiments: # Create new experiment - print(f"Experiment '{self.name}' not found. Creating it.") + print(f"Experiment '{self.experiment.name}' not found. Creating it.") experiment_payload = { "data": { "type": "experiments", "attributes": { - "name": self.name, - "description": f"Experiment: {self.name} on dataset: {self.dataset.name}", - "dataset_id": self.dataset.datadog_dataset_id, + "name": self.experiment.name, + "description": f"Experiment: {self.experiment.name} on dataset: {self.experiment.dataset.name}", + "dataset_id": self.experiment.dataset.datadog_dataset_id, "project_id": project_id, "metadata": { - "tags": self.tags, + "tags": self.experiment.tags, "team": "ml-obs" } } @@ -427,18 +641,18 @@ def push(self) -> Dict[str, str]: else: # Experiment exists, create a new version version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - new_experiment_name = f"{self.name}-{version_suffix}" - print(f"Experiment '{self.name}' found. Creating new version '{new_experiment_name}'.") + new_experiment_name = f"{self.experiment.name}-{version_suffix}" + print(f"Experiment '{self.experiment.name}' found. 
Creating new version '{new_experiment_name}'.") experiment_payload = { "data": { "type": "experiments", "attributes": { "name": new_experiment_name, - "description": f"Experiment versioned on {version_suffix} used for {self.name}", - "dataset_id": self.dataset.datadog_dataset_id, + "description": f"Experiment versioned on {version_suffix} used for {self.experiment.name}", + "dataset_id": self.experiment.dataset.datadog_dataset_id, "project_id": project_id, "metadata": { - "tags": self.tags, + "tags": self.experiment.tags, "team": "ml-obs" } } @@ -453,7 +667,7 @@ def push(self) -> Dict[str, str]: context="Experiment version creation" ) experiment_id = response_data['data']['id'] - self.name = new_experiment_name + self.experiment.name = new_experiment_name # Prepare and send experiment results spans = [] @@ -461,27 +675,27 @@ def push(self) -> Dict[str, str]: - for idx, result in enumerate(self.results): + for idx, result in enumerate(self.experiment_rows): span = { "span_id": _make_id(), "project_id": project_id, "experiment_id": experiment_id, - "dataset_id": self.dataset.datadog_dataset_id, + "dataset_id": self.experiment.dataset.datadog_dataset_id, "dataset_record_id": _make_id(), "start_ns": int(result['metadata']['timestamp'] * 1e9), "duration": float(result['metadata']['duration'] * 1e9), - "tags": self.tags, + "tags": self.experiment.tags, "status": "ok", "meta": { "span": {"kind": "experiment"}, - "input": self.dataset[idx]['input'], + "input": self.experiment.dataset[idx]['input'], "output": result['output'], - "expected_output": self.dataset[idx].get('expected_output', {}), + "expected_output": self.experiment.dataset[idx].get('expected_output', {}), "error": { - "message": result['error'], - "stack": None, - "type": None + "message": result['error'], + "stack": None, + "type": None } } @@ -531,7 +745,7 @@ def push(self) -> Dict[str, str]: context="Publishing results" ) - print(f"✓ Successfully uploaded experiment '{self.name}'") + print(f"✓ Successfully uploaded experiment results for '{self.experiment.name}'") print(f" • Experiment ID: {experiment_id}") print(f" • Spans uploaded: {len(spans)}") print(f" • Metrics uploaded: {len(metrics)}") @@ -542,81 +756,33 @@ def push(self) -> Dict[str, str]: conn.close() - - -class ExperimentResults: - def __init__(self, dataset: Dataset) -> None: - self.dataset = dataset - self.experiment_rows = [] - - def __repr__(self) -> str: - separator = "+" + "-"*10 + "+" + "-"*38 + "+" + "-"*38 + "+" + "-"*38 + "+" + "-"*38 + "+" - - def format_dict(d: Union[Dict[str, Any], List[Any]]) -> List[str]: - if isinstance(d, dict): - def truncate(value: str) -> str: - return (value[:17] + '...') if len(value) > 20 else value - - return [f"{key}: {truncate(str(value))}" for key, value in d.items()] - elif isinstance(d, list): - return [str(item) for item in d] - else: - return [str(d)] - - def format_entries(entries): - formatted_rows = [] - for i, entry in enumerate(entries): - dataset_entry = self.dataset[i] - input_lines = format_dict(dataset_entry['input']) - expected_output_lines = format_dict(dataset_entry.get('expected_output', {})) - output_lines = format_dict(entry['output']) - evaluations_lines = format_dict(entry.get('evaluations', [])) - - # Determine the maximum number of lines across all fields - max_lines = max(len(input_lines), len(expected_output_lines), len(output_lines), len(evaluations_lines)) - - # Pad the lists to have the same number of lines - input_lines += [''] * (max_lines - len(input_lines)) - expected_output_lines += [''] * 
(max_lines - len(expected_output_lines)) - output_lines += [''] * (max_lines - len(output_lines)) - evaluations_lines += [''] * (max_lines - len(evaluations_lines)) - - for j in range(max_lines): - if j == 0: - index = f"| {i+1:<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" - else: - index = f"| {'':<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" - formatted_rows.append(index) - formatted_rows.append(separator) - return "\n".join(formatted_rows) - - if len(self.experiment_rows) <= 4: - entries = format_entries(self.experiment_rows) - else: - first_two = format_entries(self.experiment_rows[:2]) - last_two = format_entries(self.experiment_rows[-2:]) - entries = f"{first_two}\n| {'...':<8} | {'...':<38} | {'...':<38} | {'...':<38} | {'...':<38} |\n{separator}\n{last_two}" - - table = ( - f"{separator}\n" - f"| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} | {'Output':<38} | {'Evaluations':<38} |\n" - f"{separator}\n" - f"{entries}" - ) - return f"Experiment Results:\n{table if entries else 'No results available.'}\n\n" - - def __iter__(self) -> iter: - return iter(self.experiment_rows) - - def __len__(self) -> int: - return len(self.experiment_rows) - - def __getitem__(self, index: int) -> Any: - return self.experiment_rows[index] - - - -def _make_request(conn: HTTPSConnection, headers: Dict[str, Any], method: str, url: str, body: Any = None, context: str = "") -> Dict[str, Any]: +def _make_request( + conn: HTTPSConnection, + headers: Dict[str, Any], + method: str, + url: str, + body: Optional[Any] = None, + context: str = "" +) -> Dict[str, Any]: + """Make an HTTP request to the Datadog API. + + Handles making HTTP requests to Datadog's API with proper error handling + and response parsing. + + Args: + conn: The HTTP connection to use + headers: Request headers + method: HTTP method (GET, POST, etc.) + url: Request URL + body: Request body (optional) + context: Context string for error messages (optional) + + Returns: + Dict[str, Any]: Parsed JSON response + + Raises: + Exception: If the request fails, returns an error status, or returns invalid JSON + """ if method == "GET": conn.request(method, url, headers=headers) else: @@ -642,7 +808,56 @@ def _make_request(conn: HTTPSConnection, headers: Dict[str, Any], method: str, u raise Exception(error_message) def _make_id() -> str: - return hashlib.sha256(datetime.now().isoformat().encode('utf-8')).hexdigest() + """Generate a unique identifier. + + Returns: + str: A random UUID as a hexadecimal string + """ + return uuid.uuid4().hex + +def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: + """Decorator that creates multiple versions of a function with different parameter combinations. + + Creates multiple versions of a function by generating all possible combinations + of the provided parameters. Each generated function variant includes tags + indicating its parameter values. + + Args: + **param_dict: Dictionary of parameter names and their possible values. + Values can be single items or lists of possible values. 
+ + Returns: + Callable: Decorator function that generates parameterized versions of the input function + + Example: + @parametrize(model=["gpt-3", "gpt-4"], temperature=[0.0, 0.7]) + def my_function(text, model, temperature): + # This will create 4 versions of the function with different combinations + # of model and temperature parameters + pass + """ + def decorator(func): + # Generate all combinations of parameters + param_names = list(param_dict.keys()) + param_values = [param_dict[name] if isinstance(param_dict[name], (list, tuple)) + else [param_dict[name]] for name in param_names] + param_combinations = [dict(zip(param_names, combo)) + for combo in itertools.product(*param_values)] + + # Create a new function for each parameter combination + def create_parameterized_func(params): + def wrapped_func(*args, **kwargs): + return func(*args, **{**kwargs, **params}) + + # Create a descriptive name for the function + param_str = "-".join(f"{k}={v}" for k, v in params.items()) + wrapped_func.__name__ = f"{func.__name__}_{param_str}" + wrapped_func.tags = [f"{k}:{v}" for k, v in params.items()] + return wrapped_func + + return [create_parameterized_func(combo) for combo in param_combinations] + + return decorator From 88f05d3a2c16768bd8f0a2722869f7baeaa20f37 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 29 Oct 2024 10:48:36 -0400 Subject: [PATCH 05/36] Format code --- ddtrace/llmobs/experiments.py | 416 +++++++++++++++++++--------------- 1 file changed, 236 insertions(+), 180 deletions(-) diff --git a/ddtrace/llmobs/experiments.py b/ddtrace/llmobs/experiments.py index 5781b1e9572..ce159c752c8 100644 --- a/ddtrace/llmobs/experiments.py +++ b/ddtrace/llmobs/experiments.py @@ -25,7 +25,7 @@ def _validate_api_keys() -> None: for key in ["DD_API_KEY", "DD_APPLICATION_KEY"]: if not os.getenv(key): missing_keys.append(key) - + if missing_keys: raise ValueError( f"Missing required Datadog API keys in environment variables: {', '.join(missing_keys)}. 
" @@ -46,7 +46,9 @@ class Dataset: datadog_dataset_id (str): ID assigned by Datadog after pushing (None if not pushed) """ - def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") -> None: + def __init__( + self, name: str, data: List[Dict[str, Any]], description: str = "" + ) -> None: self.name = name self._validate_data(data) self.data = data @@ -54,7 +56,6 @@ def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") # Post-push attributes self.datadog_dataset_id = None - def __iter__(self) -> Iterator[Dict[str, Any]]: return iter(self.data) @@ -64,31 +65,30 @@ def __len__(self) -> int: def __getitem__(self, index: int) -> Dict[str, Any]: return self.data[index] - - + def __repr__(self) -> str: header = f"Dataset: {self.name}\nDescription: {self.description}\nLength: {len(self)}\nDatadog ID: {self.datadog_dataset_id}\n" - separator = "+" + "-"*10 + "+" + "-"*38 + "+" + "-"*38 + "+" + separator = "+" + "-" * 10 + "+" + "-" * 38 + "+" + "-" * 38 + "+" def format_dict(d: Dict[str, Any]) -> List[str]: def truncate(value: str) -> str: - return (value[:17] + '...') if len(value) > 20 else value + return (value[:17] + "...") if len(value) > 20 else value return [f"{key}: {truncate(str(value))}" for key, value in d.items()] - def format_entries(entries): # Fixed indentation - this was nested too deeply + def format_entries(entries): formatted_rows = [] for i, entry in entries: - input_lines = format_dict(entry['input']) - expected_output_lines = format_dict(entry.get('expected_output', {})) - + input_lines = format_dict(entry["input"]) + expected_output_lines = format_dict(entry.get("expected_output", {})) + # Determine the maximum number of lines in input and expected_output max_lines = max(len(input_lines), len(expected_output_lines)) - + # Pad the lists to have the same number of lines - input_lines += [''] * (max_lines - len(input_lines)) - expected_output_lines += [''] * (max_lines - len(expected_output_lines)) - + input_lines += [""] * (max_lines - len(input_lines)) + expected_output_lines += [""] * (max_lines - len(expected_output_lines)) + for j in range(max_lines): if j == 0: index = f"| {i+1:<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} |" @@ -102,12 +102,14 @@ def format_entries(entries): # Fixed indentation - this was nested too deeply entries = format_entries(enumerate(self.data)) else: first_two = format_entries(enumerate(self.data[:2])) - last_two = format_entries(enumerate(self.data[-2:], start=len(self.data) - 2)) + last_two = format_entries( + enumerate(self.data[-2:], start=len(self.data) - 2) + ) entries = f"{first_two}\n| {'...':<8} | {'...':<38} | {'...':<38} |\n{separator}\n{last_two}" table = f"{separator}\n| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} |\n{separator}\n{entries}" return f"{header}\n{table if entries else 'No entries available.'}\n\n" - + def _validate_data(self, data: List[Dict[str, Any]]) -> None: """Validate the format and structure of dataset records. 
@@ -132,14 +134,18 @@ def _validate_data(self, data: List[Dict[str, Any]]) -> None: for row in data: if set(row.keys()) != first_row_keys: raise ValueError("All rows must have the same keys.") - + # Check that 'input' and 'expected_output' are flat dictionaries - for key in ['input', 'expected_output']: - if key in row and any(isinstance(value, dict) for value in row[key].values()): - raise ValueError(f"'{key}' must be a flat dictionary (no nested dictionaries).") + for key in ["input", "expected_output"]: + if key in row and any( + isinstance(value, dict) for value in row[key].values() + ): + raise ValueError( + f"'{key}' must be a flat dictionary (no nested dictionaries)." + ) @classmethod - def from_datadog(cls, name: str) -> 'Dataset': + def from_datadog(cls, name: str) -> "Dataset": """Create a dataset from a dataset hosted in Datadog. Args: @@ -157,34 +163,40 @@ def from_datadog(cls, name: str) -> 'Dataset': headers = { "DD-API-KEY": os.getenv("DD_API_KEY"), "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), - "Content-Type": "application/json" + "Content-Type": "application/json", } try: # Get dataset ID encoded_name = quote(name) url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" - response_data = _make_request(conn, headers, "GET", url, context="Dataset lookup") - datasets = response_data.get('data', []) + response_data = _make_request( + conn, headers, "GET", url, context="Dataset lookup" + ) + datasets = response_data.get("data", []) if not datasets: raise ValueError(f"Dataset '{name}' not found") - dataset_id = datasets[0]['id'] + dataset_id = datasets[0]["id"] # Get dataset records url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" - records_data = _make_request(conn, headers, "GET", url, context="Records lookup") - + records_data = _make_request( + conn, headers, "GET", url, context="Records lookup" + ) + # Transform records into the expected format class_records = [] - for record in records_data.get('data', []): - attrs = record.get('attributes', {}) - class_records.append({ - "input": attrs.get('input', {}), - "expected_output": attrs.get('expected_output', {}), - **attrs.get('metadata', {}) - }) + for record in records_data.get("data", []): + attrs = record.get("attributes", {}) + class_records.append( + { + "input": attrs.get("input", {}), + "expected_output": attrs.get("expected_output", {}), + **attrs.get("metadata", {}), + } + ) # Create new dataset instance dataset = cls(name, class_records) @@ -196,7 +208,7 @@ def from_datadog(cls, name: str) -> 'Dataset': def push(self) -> Dict[str, str]: """Push the dataset to Datadog. 
- + Returns: Dict[str, str]: Dictionary containing dataset information including: - dataset_id: The ID of the created/updated dataset @@ -208,15 +220,17 @@ def push(self) -> Dict[str, str]: headers = { "DD-API-KEY": os.getenv("DD_API_KEY"), "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), - "Content-Type": "application/json" + "Content-Type": "application/json", } try: # Check if dataset exists encoded_name = quote(self.name) url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" - response_data = _make_request(conn, headers, "GET", url, context="Dataset lookup") - datasets = response_data.get('data', []) + response_data = _make_request( + conn, headers, "GET", url, context="Dataset lookup" + ) + datasets = response_data.get("data", []) if not datasets: # Create new dataset @@ -226,9 +240,10 @@ def push(self) -> Dict[str, str]: "type": "datasets", "attributes": { "name": self.name, - "description": self.description or f"Dataset used for {self.name}", - "metadata": {"team": "ml-obs"} - } + "description": self.description + or f"Dataset used for {self.name}", + "metadata": {"team": "ml-obs"}, + }, } } response_data = _make_request( @@ -237,24 +252,26 @@ def push(self) -> Dict[str, str]: "POST", "/api/unstable/llm-obs/v1/datasets", body=json.dumps(dataset_payload), - context="Dataset creation" + context="Dataset creation", ) - dataset_id = response_data['data']['id'] + dataset_id = response_data["data"]["id"] self.datadog_dataset_id = dataset_id else: # Dataset exists, create a new version - dataset_id = datasets[0]['id'] + dataset_id = datasets[0]["id"] version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") new_dataset_name = f"{self.name}-{version_suffix}" - print(f"Dataset '{self.name}' found. Creating new version '{new_dataset_name}'.") + print( + f"Dataset '{self.name}' found. Creating new version '{new_dataset_name}'." + ) dataset_payload = { "data": { "type": "datasets", "attributes": { "name": new_dataset_name, "description": f"Dataset versioned on {version_suffix} used for {self.name}", - "metadata": {"team": "ml-obs"} - } + "metadata": {"team": "ml-obs"}, + }, } } response_data = _make_request( @@ -263,36 +280,36 @@ def push(self) -> Dict[str, str]: "POST", "/api/unstable/llm-obs/v1/datasets", body=json.dumps(dataset_payload), - context="Dataset version creation" + context="Dataset version creation", ) - dataset_id = response_data['data']['id'] + dataset_id = response_data["data"]["id"] self.datadog_dataset_id = dataset_id self.name = new_dataset_name # Add records to the dataset records_payload = { - "data": { - "type": "datasets", - "attributes": { - "records": self.data - } - } + "data": {"type": "datasets", "attributes": {"records": self.data}} } url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" - _make_request(conn, headers, "POST", url, body=json.dumps(records_payload), context="Adding records") + _make_request( + conn, + headers, + "POST", + url, + body=json.dumps(records_payload), + context="Adding records", + ) print(f"✓ Successfully uploaded dataset '{self.name}'") print(f" • Dataset ID: {dataset_id}") print(f" • Records uploaded: {len(self.data)}") - + return self finally: conn.close() - - class Experiment: """Manages the execution and evaluation of LLM tasks on a dataset. 
@@ -310,7 +327,15 @@ class Experiment: results (ExperimentResults): Results after running the experiment """ - def __init__(self, name: str, task: Callable, dataset: Dataset, evaluators: List[Callable], tags: List[str] = [], project_name: str = "-") -> None: + def __init__( + self, + name: str, + task: Callable, + dataset: Dataset, + evaluators: List[Callable], + tags: List[str] = [], + project_name: str = "-", + ) -> None: self.name = name self.task = task self.dataset = dataset @@ -321,15 +346,16 @@ def __init__(self, name: str, task: Callable, dataset: Dataset, evaluators: List self.has_run = False self.results = None - def __repr__(self) -> str: - separator = "+" + "-"*20 + "+" + "-"*50 + "+" - + separator = "+" + "-" * 20 + "+" + "-" * 50 + "+" + def format_evaluator(evaluator: Callable) -> str: return f"{evaluator.__name__}" evaluator_lines = [format_evaluator(evaluator) for evaluator in self.evaluators] - evaluators = ", ".join(evaluator_lines) if evaluator_lines else "No evaluators available" + evaluators = ( + ", ".join(evaluator_lines) if evaluator_lines else "No evaluators available" + ) table = ( f"{separator}\n" @@ -358,9 +384,11 @@ def _validate_tags(self) -> None: """ for tag in self.tags: if not isinstance(tag, str) or ":" not in tag: - raise ValueError(f"Invalid tag format: {tag}. Tags should be in the format 'key:value'.") + raise ValueError( + f"Invalid tag format: {tag}. Tags should be in the format 'key:value'." + ) - def run(self, _jobs: int = 10) -> 'ExperimentResults': + def run(self, _jobs: int = 10) -> "ExperimentResults": """Execute the experiment on the dataset. Runs the task function on each dataset record in parallel and collects @@ -391,10 +419,11 @@ def process_row(idx_row): end_time = time.time() duration = end_time - start_time - # Evaluate the output - evaluations = {evaluator.__name__: evaluator(row, output) for evaluator in self.evaluators} + evaluations = { + evaluator.__name__: evaluator(row, output) + for evaluator in self.evaluators + } - # Store the result return { "idx": idx, "result": { @@ -409,11 +438,10 @@ def process_row(idx_row): "dataset_name": self.dataset.name, }, "tags": self.tags, - "error": None - } + "error": None, + }, } except Exception as e: - # Handle exceptions and store the error return { "idx": idx, "result": { @@ -428,48 +456,47 @@ def process_row(idx_row): "dataset_name": self.dataset.name, }, "tags": self.tags, - "error": str(e) - } + "error": str(e), + }, } with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: - # Create futures list first future_to_idx = { - executor.submit(process_row, (idx, row)): idx + executor.submit(process_row, (idx, row)): idx for idx, row in enumerate(self.dataset) } - + # Process as they complete while maintaining order completed = 0 results_buffer = [None] * total_rows for future in concurrent.futures.as_completed(future_to_idx): idx = future_to_idx[future] - results_buffer[idx] = future.result()['result'] + results_buffer[idx] = future.result()["result"] completed += 1 # Update progress progress = int(50 * completed / total_rows) - bar = '=' * progress + ' ' * (50 - progress) + bar = "=" * progress + " " * (50 - progress) percent = int(100 * completed / total_rows) - sys.stdout.write(f'\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})') + sys.stdout.write( + f"\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})" + ) sys.stdout.flush() # Add results in correct order results.experiment_rows = results_buffer # Print a new line after completion 
- sys.stdout.write('\n') + sys.stdout.write("\n") self.has_run = True self.results = results return results - def get_results(self) -> Union['ExperimentResults', List['ExperimentResults']]: + def get_results(self) -> Union["ExperimentResults", List["ExperimentResults"]]: if not self.has_run: raise ValueError("Experiment has not been run yet") return self.results - - class ExperimentResults: @@ -490,12 +517,25 @@ def __init__(self, dataset: Dataset, experiment: Experiment) -> None: self.experiment_rows = [] def __repr__(self) -> str: - separator = "+" + "-"*10 + "+" + "-"*38 + "+" + "-"*38 + "+" + "-"*38 + "+" + "-"*38 + "+" + separator = ( + "+" + + "-" * 10 + + "+" + + "-" * 38 + + "+" + + "-" * 38 + + "+" + + "-" * 38 + + "+" + + "-" * 38 + + "+" + ) def format_dict(d: Union[Dict[str, Any], List[Any]]) -> List[str]: if isinstance(d, dict): + def truncate(value: str) -> str: - return (value[:17] + '...') if len(value) > 20 else value + return (value[:17] + "...") if len(value) > 20 else value return [f"{key}: {truncate(str(value))}" for key, value in d.items()] elif isinstance(d, list): @@ -506,22 +546,29 @@ def truncate(value: str) -> str: def format_entries(entries): formatted_rows = [] for i, entry in enumerate(entries): - dataset_idx = entry['metadata']['dataset_record_idx'] + dataset_idx = entry["metadata"]["dataset_record_idx"] dataset_entry = self.dataset[dataset_idx] - input_lines = format_dict(dataset_entry['input']) - expected_output_lines = format_dict(dataset_entry.get('expected_output', {})) - output_lines = format_dict(entry['output']) - evaluations_lines = format_dict(entry.get('evaluations', [])) - + input_lines = format_dict(dataset_entry["input"]) + expected_output_lines = format_dict( + dataset_entry.get("expected_output", {}) + ) + output_lines = format_dict(entry["output"]) + evaluations_lines = format_dict(entry.get("evaluations", [])) + # Determine the maximum number of lines across all fields - max_lines = max(len(input_lines), len(expected_output_lines), len(output_lines), len(evaluations_lines)) - + max_lines = max( + len(input_lines), + len(expected_output_lines), + len(output_lines), + len(evaluations_lines), + ) + # Pad the lists to have the same number of lines - input_lines += [''] * (max_lines - len(input_lines)) - expected_output_lines += [''] * (max_lines - len(expected_output_lines)) - output_lines += [''] * (max_lines - len(output_lines)) - evaluations_lines += [''] * (max_lines - len(evaluations_lines)) - + input_lines += [""] * (max_lines - len(input_lines)) + expected_output_lines += [""] * (max_lines - len(expected_output_lines)) + output_lines += [""] * (max_lines - len(output_lines)) + evaluations_lines += [""] * (max_lines - len(evaluations_lines)) + for j in range(max_lines): if j == 0: index = f"| {dataset_idx:<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" @@ -544,20 +591,22 @@ def format_entries(entries): f"{separator}\n" f"{entries}" ) - return f"Experiment Results:\n{table if entries else 'No results available.'}\n\n" + return ( + f"Experiment Results:\n{table if entries else 'No results available.'}\n\n" + ) def __iter__(self) -> Iterator[Dict[str, Any]]: return iter(self.experiment_rows) - + def __len__(self) -> int: return len(self.experiment_rows) def __getitem__(self, index: int) -> Any: return self.experiment_rows[index] - + def push(self) -> Dict[str, str]: """Push the experiment results to Datadog. 
- + Returns: Dict[str, str]: Dictionary containing experiment information including: - experiment_id: The ID of the created experiment @@ -565,32 +614,36 @@ def push(self) -> Dict[str, str]: - span_count: Number of spans uploaded """ _validate_api_keys() - + # Initialize connection and headers conn = HTTPSConnection(BASE_URL) headers = { "DD-API-KEY": os.getenv("DD_API_KEY"), "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), - "Content-Type": "application/json" + "Content-Type": "application/json", } try: # Check if project exists url = f"/api/unstable/llm-obs/v1/projects?filter[name]={self.experiment.project_name}" - response_data = _make_request(conn, headers, "GET", url, context="Project lookup") - projects = response_data.get('data', []) + response_data = _make_request( + conn, headers, "GET", url, context="Project lookup" + ) + projects = response_data.get("data", []) if not projects: # Create new project - print(f"Project '{self.experiment.project_name}' not found. Creating it.") + print( + f"Project '{self.experiment.project_name}' not found. Creating it." + ) project_payload = { "data": { "type": "projects", "attributes": { "name": self.experiment.project_name, "description": f"Project for {self.experiment.project_name}", - "metadata": {"team": "ml-obs"} - } + "metadata": {"team": "ml-obs"}, + }, } } response_data = _make_request( @@ -599,17 +652,19 @@ def push(self) -> Dict[str, str]: "POST", "/api/unstable/llm-obs/v1/projects", body=json.dumps(project_payload), - context="Project creation" + context="Project creation", ) - project_id = response_data['data']['id'] + project_id = response_data["data"]["id"] else: - project_id = projects[0]['id'] + project_id = projects[0]["id"] # Check if experiment exists encoded_name = quote(self.experiment.name) url = f"/api/unstable/llm-obs/v1/experiments?filter[name]={encoded_name}" - response_data = _make_request(conn, headers, "GET", url, context="Experiment lookup") - experiments = response_data.get('data', []) + response_data = _make_request( + conn, headers, "GET", url, context="Experiment lookup" + ) + experiments = response_data.get("data", []) if not experiments: # Create new experiment @@ -624,9 +679,9 @@ def push(self) -> Dict[str, str]: "project_id": project_id, "metadata": { "tags": self.experiment.tags, - "team": "ml-obs" - } - } + "team": "ml-obs", + }, + }, } } response_data = _make_request( @@ -635,14 +690,16 @@ def push(self) -> Dict[str, str]: "POST", "/api/unstable/llm-obs/v1/experiments", body=json.dumps(experiment_payload), - context="Experiment creation" + context="Experiment creation", ) - experiment_id = response_data['data']['id'] + experiment_id = response_data["data"]["id"] else: # Experiment exists, create a new version version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") new_experiment_name = f"{self.experiment.name}-{version_suffix}" - print(f"Experiment '{self.experiment.name}' found. Creating new version '{new_experiment_name}'.") + print( + f"Experiment '{self.experiment.name}' found. Creating new version '{new_experiment_name}'." 
+ ) experiment_payload = { "data": { "type": "experiments", @@ -653,9 +710,9 @@ def push(self) -> Dict[str, str]: "project_id": project_id, "metadata": { "tags": self.experiment.tags, - "team": "ml-obs" - } - } + "team": "ml-obs", + }, + }, } } response_data = _make_request( @@ -664,51 +721,49 @@ def push(self) -> Dict[str, str]: "POST", "/api/unstable/llm-obs/v1/experiments", body=json.dumps(experiment_payload), - context="Experiment version creation" + context="Experiment version creation", ) - experiment_id = response_data['data']['id'] + experiment_id = response_data["data"]["id"] self.experiment.name = new_experiment_name - # Prepare and send experiment results spans = [] metrics = [] - - for idx, result in enumerate(self.experiment_rows): - + span = { "span_id": _make_id(), "project_id": project_id, "experiment_id": experiment_id, "dataset_id": self.experiment.dataset.datadog_dataset_id, "dataset_record_id": _make_id(), - "start_ns": int(result['metadata']['timestamp'] * 1e9), - "duration": float(result['metadata']['duration'] * 1e9), + "start_ns": int(result["metadata"]["timestamp"] * 1e9), + "duration": float(result["metadata"]["duration"] * 1e9), "tags": self.experiment.tags, "status": "ok", "meta": { "span": {"kind": "experiment"}, - "input": self.experiment.dataset[idx]['input'], - "output": result['output'], - "expected_output": self.experiment.dataset[idx].get('expected_output', {}), + "input": self.experiment.dataset[idx]["input"], + "output": result["output"], + "expected_output": self.experiment.dataset[idx].get( + "expected_output", {} + ), "error": { - "message": result['error'], + "message": result["error"], "stack": None, - "type": None - } - } - + "type": None, + }, + }, } spans.append(span) # Add evaluation metrics - for metric_name, metric_value in result['evaluations'].items(): - timestamp_ms = int(result['metadata']['timestamp'] * 1000) - + for metric_name, metric_value in result["evaluations"].items(): + timestamp_ms = int(result["metadata"]["timestamp"] * 1000) + if isinstance(metric_value, bool): metric_value = 1 if metric_value else 0 - metric_type = "score" + metric_type = "score" elif isinstance(metric_value, (int, float)): metric_type = "score" else: @@ -716,25 +771,23 @@ def push(self) -> Dict[str, str]: metric_value = str(metric_value) metric = { - "span_id": span['span_id'], + "span_id": span["span_id"], "metric_type": metric_type, "timestamp_ms": timestamp_ms, "label": metric_name, - "score_value" if metric_type == "score" else "categorical_value": metric_value + "score_value" + if metric_type == "score" + else "categorical_value": metric_value, } metrics.append(metric) results_payload = { "data": { "type": "experiments", - "attributes": { - "spans": spans, - "metrics": metrics - } + "attributes": {"spans": spans, "metrics": metrics}, } } - url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" _make_request( conn, @@ -742,14 +795,16 @@ def push(self) -> Dict[str, str]: "POST", url, body=json.dumps(results_payload), - context="Publishing results" + context="Publishing results", ) - print(f"✓ Successfully uploaded experiment results for '{self.experiment.name}'") + print( + f"✓ Successfully uploaded experiment results for '{self.experiment.name}'" + ) print(f" • Experiment ID: {experiment_id}") print(f" • Spans uploaded: {len(spans)}") print(f" • Metrics uploaded: {len(metrics)}") - + return self finally: @@ -762,7 +817,7 @@ def _make_request( method: str, url: str, body: Optional[Any] = None, - context: str = "" + context: str = "", ) -> 
Dict[str, Any]: """Make an HTTP request to the Datadog API. @@ -787,26 +842,26 @@ def _make_request( conn.request(method, url, headers=headers) else: if body is not None and isinstance(body, str): - body = body.encode('utf-8') + body = body.encode("utf-8") conn.request(method, url, body=body, headers=headers) - + response = conn.getresponse() response_body = response.read() - + if response.status >= 400: error_message = f"HTTP {response.status} Error during {context}: {response.reason}\nResponse body: {response_body.decode('utf-8')}" raise Exception(error_message) - - # Add handling for empty response + if not response_body: - return {} # Return empty dict for empty responses - + return {} + try: return json.loads(response_body) except json.JSONDecodeError: error_message = f"Invalid JSON response during {context}. Status: {response.status}\nResponse body: {response_body.decode('utf-8')}" raise Exception(error_message) + def _make_id() -> str: """Generate a unique identifier. @@ -815,17 +870,18 @@ def _make_id() -> str: """ return uuid.uuid4().hex + def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: """Decorator that creates multiple versions of a function with different parameter combinations. - + Creates multiple versions of a function by generating all possible combinations of the provided parameters. Each generated function variant includes tags indicating its parameter values. - + Args: **param_dict: Dictionary of parameter names and their possible values. Values can be single items or lists of possible values. - + Returns: Callable: Decorator function that generates parameterized versions of the input function @@ -836,30 +892,30 @@ def my_function(text, model, temperature): # of model and temperature parameters pass """ + def decorator(func): # Generate all combinations of parameters param_names = list(param_dict.keys()) - param_values = [param_dict[name] if isinstance(param_dict[name], (list, tuple)) - else [param_dict[name]] for name in param_names] - param_combinations = [dict(zip(param_names, combo)) - for combo in itertools.product(*param_values)] - + param_values = [ + param_dict[name] + if isinstance(param_dict[name], (list, tuple)) + else [param_dict[name]] + for name in param_names + ] + param_combinations = [ + dict(zip(param_names, combo)) for combo in itertools.product(*param_values) + ] + # Create a new function for each parameter combination def create_parameterized_func(params): def wrapped_func(*args, **kwargs): return func(*args, **{**kwargs, **params}) - - # Create a descriptive name for the function + param_str = "-".join(f"{k}={v}" for k, v in params.items()) wrapped_func.__name__ = f"{func.__name__}_{param_str}" wrapped_func.tags = [f"{k}:{v}" for k, v in params.items()] return wrapped_func - - return [create_parameterized_func(combo) for combo in param_combinations] - - return decorator - - - + return [create_parameterized_func(combo) for combo in param_combinations] + return decorator From e73a897cb892c4b7dfbfe39eb335f0a8881c34ec Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 29 Oct 2024 10:55:53 -0400 Subject: [PATCH 06/36] Add custom exception classes --- ddtrace/llmobs/experiments.py | 181 ++++++++++++++++------------------ 1 file changed, 86 insertions(+), 95 deletions(-) diff --git a/ddtrace/llmobs/experiments.py b/ddtrace/llmobs/experiments.py index ce159c752c8..93ec0bab7f6 100644 --- a/ddtrace/llmobs/experiments.py +++ b/ddtrace/llmobs/experiments.py @@ -15,24 +15,6 @@ BASE_URL = "api.datadoghq.com" -def 
_validate_api_keys() -> None: - """Validate that required Datadog API keys are set in environment variables. - - Raises: - ValueError: If any required API keys are missing from environment variables - """ - missing_keys = [] - for key in ["DD_API_KEY", "DD_APPLICATION_KEY"]: - if not os.getenv(key): - missing_keys.append(key) - - if missing_keys: - raise ValueError( - f"Missing required Datadog API keys in environment variables: {', '.join(missing_keys)}. " - "Please set these environment variables before pushing to Datadog." - ) - - class Dataset: """A container for LLM experiment data that can be pushed to and retrieved from Datadog. @@ -68,7 +50,7 @@ def __getitem__(self, index: int) -> Dict[str, Any]: def __repr__(self) -> str: header = f"Dataset: {self.name}\nDescription: {self.description}\nLength: {len(self)}\nDatadog ID: {self.datadog_dataset_id}\n" - separator = "+" + "-" * 10 + "+" + "-" * 38 + "+" + "-" * 38 + "+" + separator = f"+{'-' * 10}+{'-' * 38}+{'-' * 38}+" def format_dict(d: Dict[str, Any]) -> List[str]: def truncate(value: str) -> str: @@ -347,7 +329,7 @@ def __init__( self.results = None def __repr__(self) -> str: - separator = "+" + "-" * 20 + "+" + "-" * 50 + "+" + separator = f"+{'-' * 20}+{'-' * 50}+" def format_evaluator(evaluator: Callable) -> str: return f"{evaluator.__name__}" @@ -517,19 +499,7 @@ def __init__(self, dataset: Dataset, experiment: Experiment) -> None: self.experiment_rows = [] def __repr__(self) -> str: - separator = ( - "+" - + "-" * 10 - + "+" - + "-" * 38 - + "+" - + "-" * 38 - + "+" - + "-" * 38 - + "+" - + "-" * 38 - + "+" - ) + separator = f"+{'-' * 10}+{'-' * 38}+{'-' * 38}+{'-' * 38}+{'-' * 38}+" def format_dict(d: Union[Dict[str, Any], List[Any]]) -> List[str]: if isinstance(d, dict): @@ -591,9 +561,7 @@ def format_entries(entries): f"{separator}\n" f"{entries}" ) - return ( - f"Experiment Results:\n{table if entries else 'No results available.'}\n\n" - ) + return f"Experiment Results:\n{table if entries else 'No results available.'}\n\n" def __iter__(self) -> Iterator[Dict[str, Any]]: return iter(self.experiment_rows) @@ -811,6 +779,57 @@ def push(self) -> Dict[str, str]: conn.close() +def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: + """Decorator that creates multiple versions of a function with different parameter combinations. + + Creates multiple versions of a function by generating all possible combinations + of the provided parameters. Each generated function variant includes tags + indicating its parameter values. + + Args: + **param_dict: Dictionary of parameter names and their possible values. + Values can be single items or lists of possible values. 
+ + Returns: + Callable: Decorator function that generates parameterized versions of the input function + + Example: + @parametrize(model=["gpt-3", "gpt-4"], temperature=[0.0, 0.7]) + def my_function(text, model, temperature): + # This will create 4 versions of the function with different combinations + # of model and temperature parameters + pass + """ + + def decorator(func): + # Generate all combinations of parameters + param_names = list(param_dict.keys()) + param_values = [ + param_dict[name] + if isinstance(param_dict[name], (list, tuple)) + else [param_dict[name]] + for name in param_names + ] + param_combinations = [ + dict(zip(param_names, combo)) for combo in itertools.product(*param_values) + ] + + # Create a new function for each parameter combination + def create_parameterized_func(params): + def wrapped_func(*args, **kwargs): + return func(*args, **{**kwargs, **params}) + + param_str = "-".join(f"{k}={v}" for k, v in params.items()) + wrapped_func.__name__ = f"{func.__name__}_{param_str}" + wrapped_func.tags = [f"{k}:{v}" for k, v in params.items()] + return wrapped_func + + return [create_parameterized_func(combo) for combo in param_combinations] + + return decorator + + + def _make_request( conn: HTTPSConnection, headers: Dict[str, Any], @@ -821,22 +840,9 @@ def _make_request( ) -> Dict[str, Any]: """Make an HTTP request to the Datadog API. - Handles making HTTP requests to Datadog's API with proper error handling - and response parsing. - - Args: - conn: The HTTP connection to use - headers: Request headers - method: HTTP method (GET, POST, etc.) - url: Request URL - body: Request body (optional) - context: Context string for error messages (optional) - - Returns: - Dict[str, Any]: Parsed JSON response - Raises: - Exception: If the request fails, returns an error status, or returns invalid JSON + DatadogAPIError: If the request fails or returns an error status + DatadogResponseError: If the response contains invalid JSON """ if method == "GET": conn.request(method, url, headers=headers) @@ -847,10 +853,11 @@ def _make_request( response = conn.getresponse() response_body = response.read() + response_text = response_body.decode('utf-8') if response.status >= 400: - error_message = f"HTTP {response.status} Error during {context}: {response.reason}\nResponse body: {response_body.decode('utf-8')}" - raise Exception(error_message) + error_message = f"HTTP {response.status} Error during {context}: {response.reason}" + raise DatadogAPIError(error_message, status_code=response.status, response=response_text) if not response_body: return {} @@ -858,8 +865,8 @@ def _make_request( try: return json.loads(response_body) except json.JSONDecodeError: - error_message = f"Invalid JSON response during {context}. Status: {response.status}\nResponse body: {response_body.decode('utf-8')}" - raise Exception(error_message) + error_message = f"Invalid JSON response during {context}. Status: {response.status}" + raise DatadogResponseError(error_message, raw_response=response_text) def _make_id() -> str: @@ -871,51 +878,35 @@ def _make_id() -> str: return uuid.uuid4().hex -def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: - """Decorator that creates multiple versions of a function with different parameter combinations. 
+class DatadogAPIError(Exception): + """Raised when there is an error interacting with the Datadog API.""" + def __init__(self, message: str, status_code: Optional[int] = None, response: Optional[str] = None): + self.status_code = status_code + self.response = response + super().__init__(message) - Creates multiple versions of a function by generating all possible combinations - of the provided parameters. Each generated function variant includes tags - indicating its parameter values. +class DatadogResponseError(Exception): + """Raised when there is an error parsing the response from Datadog.""" + def __init__(self, message: str, raw_response: Optional[str] = None): + self.raw_response = raw_response + super().__init__(message) - Args: - **param_dict: Dictionary of parameter names and their possible values. - Values can be single items or lists of possible values. - Returns: - Callable: Decorator function that generates parameterized versions of the input function +def _validate_api_keys() -> None: + """Validate that required Datadog API keys are set in environment variables. - Example: - @parametrize(model=["gpt-3", "gpt-4"], temperature=[0.0, 0.7]) - def my_function(text, model, temperature): - # This will create 4 versions of the function with different combinations - # of model and temperature parameters - pass + Raises: + ValueError: If any required API keys are missing from environment variables """ + missing_keys = [] + for key in ["DD_API_KEY", "DD_APPLICATION_KEY"]: + if not os.getenv(key): + missing_keys.append(key) - def decorator(func): - # Generate all combinations of parameters - param_names = list(param_dict.keys()) - param_values = [ - param_dict[name] - if isinstance(param_dict[name], (list, tuple)) - else [param_dict[name]] - for name in param_names - ] - param_combinations = [ - dict(zip(param_names, combo)) for combo in itertools.product(*param_values) - ] - - # Create a new function for each parameter combination - def create_parameterized_func(params): - def wrapped_func(*args, **kwargs): - return func(*args, **{**kwargs, **params}) - - param_str = "-".join(f"{k}={v}" for k, v in params.items()) - wrapped_func.__name__ = f"{func.__name__}_{param_str}" - wrapped_func.tags = [f"{k}:{v}" for k, v in params.items()] - return wrapped_func + if missing_keys: + raise ValueError( + f"Missing required Datadog API keys in environment variables: {', '.join(missing_keys)}. " + "Please set these environment variables before pushing to Datadog." 
+ ) - return [create_parameterized_func(combo) for combo in param_combinations] - return decorator From f8c9ef0ebd182a5c2b954e4752c3f7a1dcc3c350 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 29 Oct 2024 11:24:23 -0400 Subject: [PATCH 07/36] Move code to another directory --- ddtrace/llmobs/experiments/__init__.py | 3 +++ ddtrace/llmobs/{experiments.py => experiments/_experiments.py} | 0 2 files changed, 3 insertions(+) create mode 100644 ddtrace/llmobs/experiments/__init__.py rename ddtrace/llmobs/{experiments.py => experiments/_experiments.py} (100%) diff --git a/ddtrace/llmobs/experiments/__init__.py b/ddtrace/llmobs/experiments/__init__.py new file mode 100644 index 00000000000..2979d72aebf --- /dev/null +++ b/ddtrace/llmobs/experiments/__init__.py @@ -0,0 +1,3 @@ +from ._experiments import Dataset, Experiment, parametrize + +__all__ = ["Dataset", "Experiment", "parametrize"] \ No newline at end of file diff --git a/ddtrace/llmobs/experiments.py b/ddtrace/llmobs/experiments/_experiments.py similarity index 100% rename from ddtrace/llmobs/experiments.py rename to ddtrace/llmobs/experiments/_experiments.py From 59577e1bcedaa7a5f728580581d59bda032f88ec Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 29 Oct 2024 11:29:31 -0400 Subject: [PATCH 08/36] Change experiments module export --- ddtrace/llmobs/{experiments => }/_experiments.py | 0 ddtrace/llmobs/experiments/__init__.py | 3 --- 2 files changed, 3 deletions(-) rename ddtrace/llmobs/{experiments => }/_experiments.py (100%) delete mode 100644 ddtrace/llmobs/experiments/__init__.py diff --git a/ddtrace/llmobs/experiments/_experiments.py b/ddtrace/llmobs/_experiments.py similarity index 100% rename from ddtrace/llmobs/experiments/_experiments.py rename to ddtrace/llmobs/_experiments.py diff --git a/ddtrace/llmobs/experiments/__init__.py b/ddtrace/llmobs/experiments/__init__.py deleted file mode 100644 index 2979d72aebf..00000000000 --- a/ddtrace/llmobs/experiments/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from ._experiments import Dataset, Experiment, parametrize - -__all__ = ["Dataset", "Experiment", "parametrize"] \ No newline at end of file From 402d4021d61a1e2ca54927b3accac44c7fd63ee3 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 29 Oct 2024 11:31:41 -0400 Subject: [PATCH 09/36] Use f strings --- ddtrace/llmobs/_experiments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 93ec0bab7f6..572c8ab5870 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -458,7 +458,7 @@ def process_row(idx_row): # Update progress progress = int(50 * completed / total_rows) - bar = "=" * progress + " " * (50 - progress) + bar = f"{'=' * progress}{' ' * (50 - progress)}" percent = int(100 * completed / total_rows) sys.stdout.write( f"\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})" From 2c281c5df225332c7b9344f8c3f2f1b4749a91af Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 29 Oct 2024 16:48:19 -0400 Subject: [PATCH 10/36] Decouple running from evaluating --- ddtrace/llmobs/_experiments.py | 162 ++++++++++++++++++++++++--------- 1 file changed, 118 insertions(+), 44 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 572c8ab5870..41a5cef3804 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -306,7 +306,9 @@ class Experiment: tags (List[str]): Tags for organizing experiments project_name (str): Name 
of the project this experiment belongs to has_run (bool): Whether the experiment has been executed - results (ExperimentResults): Results after running the experiment + has_evaluated (bool): Whether the evaluations have been performed + outputs (List[Dict]): Outputs after running the task + results (ExperimentResults): Results after running evaluations """ def __init__( @@ -322,10 +324,12 @@ def __init__( self.task = task self.dataset = dataset self.evaluators = evaluators - self.tags = [] + self.tags = tags self.project_name = project_name # Post-run attributes self.has_run = False + self.has_evaluated = False + self.outputs = [] self.results = None def __repr__(self) -> str: @@ -370,26 +374,23 @@ def _validate_tags(self) -> None: f"Invalid tag format: {tag}. Tags should be in the format 'key:value'." ) - def run(self, _jobs: int = 10) -> "ExperimentResults": - """Execute the experiment on the dataset. + def run(self, _jobs: int = 10) -> None: + """Execute the experiment tasks on the dataset without performing evaluations. - Runs the task function on each dataset record in parallel and collects - results and evaluations. + Runs the task function on each dataset record in parallel and stores + the outputs and metadata. Args: _jobs (int, optional): Number of parallel workers. Defaults to 10. Must be between 1 and 20. - Returns: - ExperimentResults: Object containing the experiment results - Raises: ValueError: If _jobs is not between 1 and 20 """ if not 1 <= _jobs <= 20: raise ValueError("Number of jobs must be between 1 and 20") - results = ExperimentResults(self.dataset, self) + self.outputs = [] total_rows = len(self.dataset) def process_row(idx_row): @@ -401,42 +402,117 @@ def process_row(idx_row): end_time = time.time() duration = end_time - start_time + return { + "idx": idx, + "output": output, + "metadata": { + "timestamp": start_time, + "duration": duration, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "error": None, + } + except Exception as e: + return { + "idx": idx, + "output": None, + "metadata": { + "timestamp": time.time(), + "duration": 0, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "error": str(e), + } + + with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: + future_to_idx = { + executor.submit(process_row, (idx, row)): idx + for idx, row in enumerate(self.dataset) + } + + # Process as they complete while maintaining order + completed = 0 + outputs_buffer = [None] * total_rows + for future in concurrent.futures.as_completed(future_to_idx): + idx = future_to_idx[future] + outputs_buffer[idx] = future.result() + completed += 1 + + # Update progress + progress = int(50 * completed / total_rows) + bar = f"{'=' * progress}{' ' * (50 - progress)}" + percent = int(100 * completed / total_rows) + sys.stdout.write( + f"\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})" + ) + sys.stdout.flush() + + self.outputs = outputs_buffer + + sys.stdout.write("\n") + + self.has_run = True + + return self + + def eval(self, _jobs: int = 10) -> "ExperimentResults": + """Evaluate the outputs using the provided evaluators. + + Runs the evaluators on each output in parallel and collects evaluations. + + Args: + _jobs (int, optional): Number of parallel workers. Defaults to 10. + Must be between 1 and 20. 
+ + Returns: + ExperimentResults: Object containing the experiment results + + Raises: + ValueError: If _jobs is not between 1 and 20 + ValueError: If the experiment has not been run yet + """ + if not 1 <= _jobs <= 20: + raise ValueError("Number of jobs must be between 1 and 20") + + if not self.has_run: + raise ValueError("Experiment has not been run yet. Please call run() before eval().") + + results = ExperimentResults(self.dataset, self) + total_rows = len(self.outputs) + + def evaluate_output(idx_output): + idx, output_data = idx_output + try: + idx_in_dataset = output_data["metadata"]["dataset_record_idx"] + row = self.dataset[idx_in_dataset] + output = output_data["output"] evaluations = { evaluator.__name__: evaluator(row, output) for evaluator in self.evaluators } - return { - "idx": idx, - "result": { - "output": output, - "evaluations": evaluations, - "metadata": { - "timestamp": start_time, - "duration": duration, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "tags": self.tags, - "error": None, - }, + result = { + "output": output, + "evaluations": evaluations, + "metadata": output_data["metadata"], + "tags": self.tags, + "error": output_data["error"], } + + return {"idx": idx, "result": result} except Exception as e: return { "idx": idx, "result": { - "output": None, + "output": output_data["output"], "evaluations": {}, - "metadata": { - "timestamp": time.time(), - "duration": 0, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, + "metadata": output_data["metadata"], "tags": self.tags, "error": str(e), }, @@ -444,8 +520,8 @@ def process_row(idx_row): with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: future_to_idx = { - executor.submit(process_row, (idx, row)): idx - for idx, row in enumerate(self.dataset) + executor.submit(evaluate_output, (idx, output_data)): idx + for idx, output_data in enumerate(self.outputs) } # Process as they complete while maintaining order @@ -461,23 +537,21 @@ def process_row(idx_row): bar = f"{'=' * progress}{' ' * (50 - progress)}" percent = int(100 * completed / total_rows) sys.stdout.write( - f"\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})" + f"\rEvaluating {self.name}: [{bar}] {percent}% ({completed}/{total_rows})" ) sys.stdout.flush() - # Add results in correct order results.experiment_rows = results_buffer - # Print a new line after completion sys.stdout.write("\n") - self.has_run = True + self.has_evaluated = True self.results = results return results - def get_results(self) -> Union["ExperimentResults", List["ExperimentResults"]]: - if not self.has_run: - raise ValueError("Experiment has not been run yet") + def get_results(self) -> 'ExperimentResults': + if not self.has_evaluated: + raise ValueError("Evaluations have not been performed yet. 
Please call eval() after run().") return self.results From 0e421da52b95f539661ec51c3f1c4e42b73ff064 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Mon, 4 Nov 2024 15:47:09 -0500 Subject: [PATCH 11/36] Change parametrize function to make it simpler --- ddtrace/llmobs/_experiments.py | 219 +++++++++++++++++---------------- 1 file changed, 112 insertions(+), 107 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 41a5cef3804..ed4f23ff363 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -216,14 +216,12 @@ def push(self) -> Dict[str, str]: if not datasets: # Create new dataset - print(f"Dataset '{self.name}' not found. Creating it.") dataset_payload = { "data": { "type": "datasets", "attributes": { "name": self.name, - "description": self.description - or f"Dataset used for {self.name}", + "description": self.description, "metadata": {"team": "ml-obs"}, }, } @@ -239,34 +237,11 @@ def push(self) -> Dict[str, str]: dataset_id = response_data["data"]["id"] self.datadog_dataset_id = dataset_id else: - # Dataset exists, create a new version - dataset_id = datasets[0]["id"] - version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - new_dataset_name = f"{self.name}-{version_suffix}" - print( - f"Dataset '{self.name}' found. Creating new version '{new_dataset_name}'." - ) - dataset_payload = { - "data": { - "type": "datasets", - "attributes": { - "name": new_dataset_name, - "description": f"Dataset versioned on {version_suffix} used for {self.name}", - "metadata": {"team": "ml-obs"}, - }, - } - } - response_data = _make_request( - conn, - headers, - "POST", - "/api/unstable/llm-obs/v1/datasets", - body=json.dumps(dataset_payload), - context="Dataset version creation", + # Dataset exists, raise error + raise ValueError( + f"Dataset '{self.name}' already exists. Dataset versioning will be supported in a future release. " + "Please use a different name for your dataset." ) - dataset_id = response_data["data"]["id"] - self.datadog_dataset_id = dataset_id - self.name = new_dataset_name # Add records to the dataset records_payload = { @@ -319,6 +294,8 @@ def __init__( evaluators: List[Callable], tags: List[str] = [], project_name: str = "-", + description: str = "", + metadata: Dict[str, Any] = {}, ) -> None: self.name = name self.task = task @@ -326,6 +303,8 @@ def __init__( self.evaluators = evaluators self.tags = tags self.project_name = project_name + self.description = description + self.metadata = metadata # Post-run attributes self.has_run = False self.has_evaluated = False @@ -459,7 +438,7 @@ def process_row(idx_row): self.has_run = True - return self + return self.eval() def eval(self, _jobs: int = 10) -> "ExperimentResults": """Evaluate the outputs using the provided evaluators. @@ -532,15 +511,6 @@ def evaluate_output(idx_output): results_buffer[idx] = future.result()["result"] completed += 1 - # Update progress - progress = int(50 * completed / total_rows) - bar = f"{'=' * progress}{' ' * (50 - progress)}" - percent = int(100 * completed / total_rows) - sys.stdout.write( - f"\rEvaluating {self.name}: [{bar}] {percent}% ({completed}/{total_rows})" - ) - sys.stdout.flush() - results.experiment_rows = results_buffer sys.stdout.write("\n") @@ -675,15 +645,12 @@ def push(self) -> Dict[str, str]: if not projects: # Create new project - print( - f"Project '{self.experiment.project_name}' not found. Creating it." 
- ) project_payload = { "data": { "type": "projects", "attributes": { "name": self.experiment.project_name, - "description": f"Project for {self.experiment.project_name}", + "description": "", "metadata": {"team": "ml-obs"}, }, } @@ -710,18 +677,17 @@ def push(self) -> Dict[str, str]: if not experiments: # Create new experiment - print(f"Experiment '{self.experiment.name}' not found. Creating it.") experiment_payload = { "data": { "type": "experiments", "attributes": { "name": self.experiment.name, - "description": f"Experiment: {self.experiment.name} on dataset: {self.experiment.dataset.name}", + "description": self.experiment.description, "dataset_id": self.experiment.dataset.datadog_dataset_id, "project_id": project_id, "metadata": { "tags": self.experiment.tags, - "team": "ml-obs", + **self.experiment.metadata, }, }, } @@ -747,12 +713,12 @@ def push(self) -> Dict[str, str]: "type": "experiments", "attributes": { "name": new_experiment_name, - "description": f"Experiment versioned on {version_suffix} used for {self.experiment.name}", + "description": self.experiment.description, "dataset_id": self.experiment.dataset.datadog_dataset_id, "project_id": project_id, "metadata": { "tags": self.experiment.tags, - "team": "ml-obs", + **self.experiment.metadata, }, }, } @@ -783,6 +749,8 @@ def push(self) -> Dict[str, str]: "duration": float(result["metadata"]["duration"] * 1e9), "tags": self.experiment.tags, "status": "ok", + "metrics": { # TODO: Fill in with actual metrics once we have tracing and llm spans + }, "meta": { "span": {"kind": "experiment"}, "input": self.experiment.dataset[idx]["input"], @@ -802,10 +770,11 @@ def push(self) -> Dict[str, str]: # Add evaluation metrics for metric_name, metric_value in result["evaluations"].items(): timestamp_ms = int(result["metadata"]["timestamp"] * 1000) - + + # Check for bool first, since bool is a subclass of int if isinstance(metric_value, bool): - metric_value = 1 if metric_value else 0 - metric_type = "score" + metric_type = "categorical" + metric_value = str(metric_value).lower() elif isinstance(metric_value, (int, float)): metric_type = "score" else: @@ -817,12 +786,16 @@ def push(self) -> Dict[str, str]: "metric_type": metric_type, "timestamp_ms": timestamp_ms, "label": metric_name, - "score_value" - if metric_type == "score" - else "categorical_value": metric_value, } + + if metric_type == "score": + metric["score_value"] = metric_value + else: + metric["categorical_value"] = metric_value + metrics.append(metric) + print(metrics) results_payload = { "data": { "type": "experiments", @@ -830,6 +803,8 @@ def push(self) -> Dict[str, str]: } } + + url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" _make_request( conn, @@ -853,57 +828,6 @@ def push(self) -> Dict[str, str]: conn.close() -def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: - """Decorator that creates multiple versions of a function with different parameter combinations. - - Creates multiple versions of a function by generating all possible combinations - of the provided parameters. Each generated function variant includes tags - indicating its parameter values. - - Args: - **param_dict: Dictionary of parameter names and their possible values. - Values can be single items or lists of possible values. 
- - Returns: - Callable: Decorator function that generates parameterized versions of the input function - - Example: - @parametrize(model=["gpt-3", "gpt-4"], temperature=[0.0, 0.7]) - def my_function(text, model, temperature): - # This will create 4 versions of the function with different combinations - # of model and temperature parameters - pass - """ - - def decorator(func): - # Generate all combinations of parameters - param_names = list(param_dict.keys()) - param_values = [ - param_dict[name] - if isinstance(param_dict[name], (list, tuple)) - else [param_dict[name]] - for name in param_names - ] - param_combinations = [ - dict(zip(param_names, combo)) for combo in itertools.product(*param_values) - ] - - # Create a new function for each parameter combination - def create_parameterized_func(params): - def wrapped_func(*args, **kwargs): - return func(*args, **{**kwargs, **params}) - - param_str = "-".join(f"{k}={v}" for k, v in params.items()) - wrapped_func.__name__ = f"{func.__name__}_{param_str}" - wrapped_func.tags = [f"{k}:{v}" for k, v in params.items()] - return wrapped_func - - return [create_parameterized_func(combo) for combo in param_combinations] - - return decorator - - - def _make_request( conn: HTTPSConnection, headers: Dict[str, Any], @@ -930,7 +854,7 @@ def _make_request( response_text = response_body.decode('utf-8') if response.status >= 400: - error_message = f"HTTP {response.status} Error during {context}: {response.reason}" + error_message = f"HTTP {response.status} Error during {context}: {response.reason}\nResponse: {response_text}" raise DatadogAPIError(error_message, status_code=response.status, response=response_text) if not response_body: @@ -984,3 +908,84 @@ def _validate_api_keys() -> None: ) + +def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: + """Decorator that creates multiple versions by combining all parameter values. + + Args: + **param_dict: Dictionary of parameter names and their possible values. + Values can be single items or lists of possible values. + + Returns: + List[Any]: List of results from calling the decorated function with each parameter combination + """ + def decorator(func): + # Convert single values to lists + processed_params = { + name: [val] if not isinstance(val, (list, tuple)) else val + for name, val in param_dict.items() + } + + # Generate all combinations of parameters + param_names = list(processed_params.keys()) + param_values = [processed_params[name] for name in param_names] + param_combinations = [ + dict(zip(param_names, combo)) + for combo in itertools.product(*param_values) + ] + + # Return list of results from calling function with each combination + return [func(**params) for params in param_combinations] + + return decorator + +class Prompt: + """A class for rendering templated prompts with variables. + + Supports both simple string templates and structured chat-like templates. + + Attributes: + template (Union[str, List[Dict[str, str]]]): Either a template string or a list of message dictionaries + variables (dict): Default variables to use when rendering the template + """ + + def __init__(self, template, variables=None): + """Initialize a new Prompt. + + Args: + template (Union[str, List[Dict[str, str]]]): Either a template string or a list of message dictionaries + variables (dict, optional): Default variables to use when rendering the template. Defaults to {}. 
+ """ + self.template = template + self.variables = variables or {} + + def render(self, **kwargs): + """Render the template with provided variables. + + Args: + **kwargs: Additional variables to use when rendering the template. + These override any default variables with the same name. + + Returns: + Union[str, List[Dict[str, str]]]: The rendered template with all variables substituted + """ + merged_vars = {**self.variables, **kwargs} + + if isinstance(self.template, str): + return self.template.format(**merged_vars) + elif isinstance(self.template, (list, tuple)): + return [ + { + k: v.format(**merged_vars) if isinstance(v, str) else v + for k, v in message.items() + } + for message in self.template + ] + else: + raise ValueError("Template must be either a string or a list of message dictionaries") + + + + def __repr__(self): + hash = hashlib.md5(str(self.template).encode()).hexdigest()[:8] + return f"Prompt(hash={hash})" \ No newline at end of file From 173d2aea055193c7e061223ad3a6735005e4b0a6 Mon Sep 17 00:00:00 2001 From: Kyle Verhoog Date: Tue, 5 Nov 2024 11:20:17 -0500 Subject: [PATCH 12/36] Add test file, export the top level classes --- ddtrace/llmobs/__init__.py | 5 +- ddtrace/llmobs/_experiments.py | 164 +++++++----------------- tests/llmobs/test_llmobs_experiments.py | 51 ++++++++ 3 files changed, 103 insertions(+), 117 deletions(-) create mode 100644 tests/llmobs/test_llmobs_experiments.py diff --git a/ddtrace/llmobs/__init__.py b/ddtrace/llmobs/__init__.py index 11100d3ed66..72596f2418e 100644 --- a/ddtrace/llmobs/__init__.py +++ b/ddtrace/llmobs/__init__.py @@ -7,6 +7,9 @@ """ from ._llmobs import LLMObs +from ._experiments import Dataset +from ._experiments import Experiment +from ._experiments import ExperimentResults -__all__ = ["LLMObs"] +__all__ = ["LLMObs", "Dataset", "Experiment", "ExperimentResults"] diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index ed4f23ff363..156fab1d8cc 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -8,9 +8,9 @@ import time from urllib.parse import quote import concurrent.futures -import itertools import uuid + # Constants BASE_URL = "api.datadoghq.com" @@ -28,25 +28,23 @@ class Dataset: datadog_dataset_id (str): ID assigned by Datadog after pushing (None if not pushed) """ - def __init__( - self, name: str, data: List[Dict[str, Any]], description: str = "" - ) -> None: + def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") -> None: self.name = name self._validate_data(data) - self.data = data + self._data = data self.description = description # Post-push attributes self.datadog_dataset_id = None def __iter__(self) -> Iterator[Dict[str, Any]]: - return iter(self.data) + return iter(self._data) def __len__(self) -> int: - return len(self.data) + return len(self._data) def __getitem__(self, index: int) -> Dict[str, Any]: - return self.data[index] + return self._data[index] def __repr__(self) -> str: header = f"Dataset: {self.name}\nDescription: {self.description}\nLength: {len(self)}\nDatadog ID: {self.datadog_dataset_id}\n" @@ -80,13 +78,11 @@ def format_entries(entries): formatted_rows.append(separator) return "\n".join(formatted_rows) - if len(self.data) <= 4: - entries = format_entries(enumerate(self.data)) + if len(self._data) <= 4: + entries = format_entries(enumerate(self._data)) else: - first_two = format_entries(enumerate(self.data[:2])) - last_two = format_entries( - enumerate(self.data[-2:], start=len(self.data) - 2) - ) + first_two = 
format_entries(enumerate(self._data[:2])) + last_two = format_entries(enumerate(self._data[-2:], start=len(self._data) - 2)) entries = f"{first_two}\n| {'...':<8} | {'...':<38} | {'...':<38} |\n{separator}\n{last_two}" table = f"{separator}\n| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} |\n{separator}\n{entries}" @@ -119,12 +115,8 @@ def _validate_data(self, data: List[Dict[str, Any]]) -> None: # Check that 'input' and 'expected_output' are flat dictionaries for key in ["input", "expected_output"]: - if key in row and any( - isinstance(value, dict) for value in row[key].values() - ): - raise ValueError( - f"'{key}' must be a flat dictionary (no nested dictionaries)." - ) + if key in row and any(isinstance(value, dict) for value in row[key].values()): + raise ValueError(f"'{key}' must be a flat dictionary (no nested dictionaries).") @classmethod def from_datadog(cls, name: str) -> "Dataset": @@ -152,9 +144,7 @@ def from_datadog(cls, name: str) -> "Dataset": # Get dataset ID encoded_name = quote(name) url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" - response_data = _make_request( - conn, headers, "GET", url, context="Dataset lookup" - ) + response_data = _make_request(conn, headers, "GET", url, context="Dataset lookup") datasets = response_data.get("data", []) if not datasets: @@ -164,9 +154,7 @@ def from_datadog(cls, name: str) -> "Dataset": # Get dataset records url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" - records_data = _make_request( - conn, headers, "GET", url, context="Records lookup" - ) + records_data = _make_request(conn, headers, "GET", url, context="Records lookup") # Transform records into the expected format class_records = [] @@ -209,9 +197,7 @@ def push(self) -> Dict[str, str]: # Check if dataset exists encoded_name = quote(self.name) url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" - response_data = _make_request( - conn, headers, "GET", url, context="Dataset lookup" - ) + response_data = _make_request(conn, headers, "GET", url, context="Dataset lookup") datasets = response_data.get("data", []) if not datasets: @@ -244,9 +230,7 @@ def push(self) -> Dict[str, str]: ) # Add records to the dataset - records_payload = { - "data": {"type": "datasets", "attributes": {"records": self.data}} - } + records_payload = {"data": {"type": "datasets", "attributes": {"records": self._data}}} url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" _make_request( conn, @@ -259,7 +243,7 @@ def push(self) -> Dict[str, str]: print(f"✓ Successfully uploaded dataset '{self.name}'") print(f" • Dataset ID: {dataset_id}") - print(f" • Records uploaded: {len(self.data)}") + print(f" • Records uploaded: {len(self._data)}") return self @@ -318,9 +302,7 @@ def format_evaluator(evaluator: Callable) -> str: return f"{evaluator.__name__}" evaluator_lines = [format_evaluator(evaluator) for evaluator in self.evaluators] - evaluators = ( - ", ".join(evaluator_lines) if evaluator_lines else "No evaluators available" - ) + evaluators = ", ".join(evaluator_lines) if evaluator_lines else "No evaluators available" table = ( f"{separator}\n" @@ -349,9 +331,7 @@ def _validate_tags(self) -> None: """ for tag in self.tags: if not isinstance(tag, str) or ":" not in tag: - raise ValueError( - f"Invalid tag format: {tag}. Tags should be in the format 'key:value'." - ) + raise ValueError(f"Invalid tag format: {tag}. 
Tags should be in the format 'key:value'.") def run(self, _jobs: int = 10) -> None: """Execute the experiment tasks on the dataset without performing evaluations. @@ -410,10 +390,7 @@ def process_row(idx_row): } with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: - future_to_idx = { - executor.submit(process_row, (idx, row)): idx - for idx, row in enumerate(self.dataset) - } + future_to_idx = {executor.submit(process_row, (idx, row)): idx for idx, row in enumerate(self.dataset)} # Process as they complete while maintaining order completed = 0 @@ -427,9 +404,7 @@ def process_row(idx_row): progress = int(50 * completed / total_rows) bar = f"{'=' * progress}{' ' * (50 - progress)}" percent = int(100 * completed / total_rows) - sys.stdout.write( - f"\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})" - ) + sys.stdout.write(f"\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})") sys.stdout.flush() self.outputs = outputs_buffer @@ -471,10 +446,7 @@ def evaluate_output(idx_output): idx_in_dataset = output_data["metadata"]["dataset_record_idx"] row = self.dataset[idx_in_dataset] output = output_data["output"] - evaluations = { - evaluator.__name__: evaluator(row, output) - for evaluator in self.evaluators - } + evaluations = {evaluator.__name__: evaluator(row, output) for evaluator in self.evaluators} result = { "output": output, @@ -511,6 +483,13 @@ def evaluate_output(idx_output): results_buffer[idx] = future.result()["result"] completed += 1 + # Update progress + progress = int(50 * completed / total_rows) + bar = f"{'=' * progress}{' ' * (50 - progress)}" + percent = int(100 * completed / total_rows) + sys.stdout.write(f"\rEvaluating {self.name}: [{bar}] {percent}% ({completed}/{total_rows})") + sys.stdout.flush() + results.experiment_rows = results_buffer sys.stdout.write("\n") @@ -519,7 +498,7 @@ def evaluate_output(idx_output): self.results = results return results - def get_results(self) -> 'ExperimentResults': + def get_results(self) -> "ExperimentResults": if not self.has_evaluated: raise ValueError("Evaluations have not been performed yet. Please call eval() after run().") return self.results @@ -563,9 +542,7 @@ def format_entries(entries): dataset_idx = entry["metadata"]["dataset_record_idx"] dataset_entry = self.dataset[dataset_idx] input_lines = format_dict(dataset_entry["input"]) - expected_output_lines = format_dict( - dataset_entry.get("expected_output", {}) - ) + expected_output_lines = format_dict(dataset_entry.get("expected_output", {})) output_lines = format_dict(entry["output"]) evaluations_lines = format_dict(entry.get("evaluations", [])) @@ -638,13 +615,12 @@ def push(self) -> Dict[str, str]: try: # Check if project exists url = f"/api/unstable/llm-obs/v1/projects?filter[name]={self.experiment.project_name}" - response_data = _make_request( - conn, headers, "GET", url, context="Project lookup" - ) + response_data = _make_request(conn, headers, "GET", url, context="Project lookup") projects = response_data.get("data", []) if not projects: # Create new project + print(f"Project '{self.experiment.project_name}' not found. 
Creating it.") project_payload = { "data": { "type": "projects", @@ -670,9 +646,7 @@ def push(self) -> Dict[str, str]: # Check if experiment exists encoded_name = quote(self.experiment.name) url = f"/api/unstable/llm-obs/v1/experiments?filter[name]={encoded_name}" - response_data = _make_request( - conn, headers, "GET", url, context="Experiment lookup" - ) + response_data = _make_request(conn, headers, "GET", url, context="Experiment lookup") experiments = response_data.get("data", []) if not experiments: @@ -705,9 +679,7 @@ def push(self) -> Dict[str, str]: # Experiment exists, create a new version version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") new_experiment_name = f"{self.experiment.name}-{version_suffix}" - print( - f"Experiment '{self.experiment.name}' found. Creating new version '{new_experiment_name}'." - ) + print(f"Experiment '{self.experiment.name}' found. Creating new version '{new_experiment_name}'.") experiment_payload = { "data": { "type": "experiments", @@ -738,7 +710,6 @@ def push(self) -> Dict[str, str]: metrics = [] for idx, result in enumerate(self.experiment_rows): - span = { "span_id": _make_id(), "project_id": project_id, @@ -749,15 +720,12 @@ def push(self) -> Dict[str, str]: "duration": float(result["metadata"]["duration"] * 1e9), "tags": self.experiment.tags, "status": "ok", - "metrics": { # TODO: Fill in with actual metrics once we have tracing and llm spans - }, + "metrics": {}, # TODO: Fill in with actual metrics once we have tracing and llm spans "meta": { "span": {"kind": "experiment"}, "input": self.experiment.dataset[idx]["input"], "output": result["output"], - "expected_output": self.experiment.dataset[idx].get( - "expected_output", {} - ), + "expected_output": self.experiment.dataset[idx].get("expected_output", {}), "error": { "message": result["error"], "stack": None, @@ -770,11 +738,11 @@ def push(self) -> Dict[str, str]: # Add evaluation metrics for metric_name, metric_value in result["evaluations"].items(): timestamp_ms = int(result["metadata"]["timestamp"] * 1000) - + # Check for bool first, since bool is a subclass of int if isinstance(metric_value, bool): metric_type = "categorical" - metric_value = str(metric_value).lower() + metric_value = str(metric_value).lower() elif isinstance(metric_value, (int, float)): metric_type = "score" else: @@ -786,6 +754,7 @@ def push(self) -> Dict[str, str]: "metric_type": metric_type, "timestamp_ms": timestamp_ms, "label": metric_name, + "score_value" if metric_type == "score" else "categorical_value": metric_value, } if metric_type == "score": @@ -803,8 +772,6 @@ def push(self) -> Dict[str, str]: } } - - url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" _make_request( conn, @@ -815,9 +782,7 @@ def push(self) -> Dict[str, str]: context="Publishing results", ) - print( - f"✓ Successfully uploaded experiment results for '{self.experiment.name}'" - ) + print(f"✓ Successfully uploaded experiment results for '{self.experiment.name}'") print(f" • Experiment ID: {experiment_id}") print(f" • Spans uploaded: {len(spans)}") print(f" • Metrics uploaded: {len(metrics)}") @@ -851,7 +816,7 @@ def _make_request( response = conn.getresponse() response_body = response.read() - response_text = response_body.decode('utf-8') + response_text = response_body.decode("utf-8") if response.status >= 400: error_message = f"HTTP {response.status} Error during {context}: {response.reason}\nResponse: {response_text}" @@ -878,13 +843,16 @@ def _make_id() -> str: class DatadogAPIError(Exception): """Raised when 
there is an error interacting with the Datadog API.""" + def __init__(self, message: str, status_code: Optional[int] = None, response: Optional[str] = None): self.status_code = status_code self.response = response super().__init__(message) + class DatadogResponseError(Exception): """Raised when there is an error parsing the response from Datadog.""" + def __init__(self, message: str, raw_response: Optional[str] = None): self.raw_response = raw_response super().__init__(message) @@ -908,37 +876,6 @@ def _validate_api_keys() -> None: ) - -def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: - """Decorator that creates multiple versions by combining all parameter values. - - Args: - **param_dict: Dictionary of parameter names and their possible values. - Values can be single items or lists of possible values. - - Returns: - List[Any]: List of results from calling the decorated function with each parameter combination - """ - def decorator(func): - # Convert single values to lists - processed_params = { - name: [val] if not isinstance(val, (list, tuple)) else val - for name, val in param_dict.items() - } - - # Generate all combinations of parameters - param_names = list(processed_params.keys()) - param_values = [processed_params[name] for name in param_names] - param_combinations = [ - dict(zip(param_names, combo)) - for combo in itertools.product(*param_values) - ] - - # Return list of results from calling function with each combination - return [func(**params) for params in param_combinations] - - return decorator - class Prompt: """A class for rendering templated prompts with variables. @@ -975,17 +912,12 @@ def render(self, **kwargs): return self.template.format(**merged_vars) elif isinstance(self.template, (list, tuple)): return [ - { - k: v.format(**merged_vars) if isinstance(v, str) else v - for k, v in message.items() - } + {k: v.format(**merged_vars) if isinstance(v, str) else v for k, v in message.items()} for message in self.template ] else: raise ValueError("Template must be either a string or a list of message dictionaries") - - - + def __repr__(self): hash = hashlib.md5(str(self.template).encode()).hexdigest()[:8] - return f"Prompt(hash={hash})" \ No newline at end of file + return f"Prompt(hash={hash})" diff --git a/tests/llmobs/test_llmobs_experiments.py b/tests/llmobs/test_llmobs_experiments.py new file mode 100644 index 00000000000..f01551afc17 --- /dev/null +++ b/tests/llmobs/test_llmobs_experiments.py @@ -0,0 +1,51 @@ +import itertools +from typing import Any +from typing import Callable +from typing import Dict +from typing import List +from typing import Union + +from ddtrace.llmobs import Dataset + + +def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: + """Decorator that creates multiple versions by combining all parameter values. + + Args: + **param_dict: Dictionary of parameter names and their possible values. + Values can be single items or lists of possible values. 
+ Returns: + List[Any]: List of results from calling the decorated function with each parameter combination + """ + + def decorator(func): + # Convert single values to lists + processed_params = { + name: [val] if not isinstance(val, (list, tuple)) else val for name, val in param_dict.items() + } + + # Generate all combinations of parameters + param_names = list(processed_params.keys()) + param_values = [processed_params[name] for name in param_names] + param_combinations = [dict(zip(param_names, combo)) for combo in itertools.product(*param_values)] + + # Return list of results from calling function with each combination + return [func(**params) for params in param_combinations] + + return decorator + + +def test_create_dataset(): + dataset = Dataset( + name="geography-dataset", + data=[ + {"input": {"prompt": "capital of France?"}, "expected_output": {"response": "Paris"}}, + {"input": {"prompt": "capital of Germany?"}, "expected_output": {"response": "Berlin"}}, + {"input": {"prompt": "capital of Japan?"}, "expected_output": {"response": "Tokyo"}}, + {"input": {"prompt": "capital of Canada?"}, "expected_output": {"response": "Ottawa"}}, + # ... more data entries ... + ], + ) + + assert dataset.name == "geography-dataset" + assert dataset[0] == {"input": {"prompt": "capital of France?"}, "expected_output": {"response": "Paris"}} From 044d696b63a3d0adaaa01c1a73254b0dd3878142 Mon Sep 17 00:00:00 2001 From: Kyle Verhoog Date: Tue, 5 Nov 2024 11:36:41 -0500 Subject: [PATCH 13/36] fmt --- ddtrace/llmobs/__init__.py | 2 +- ddtrace/llmobs/_experiments.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/ddtrace/llmobs/__init__.py b/ddtrace/llmobs/__init__.py index 72596f2418e..73429c6713d 100644 --- a/ddtrace/llmobs/__init__.py +++ b/ddtrace/llmobs/__init__.py @@ -6,10 +6,10 @@ LLMObs.enable() """ -from ._llmobs import LLMObs from ._experiments import Dataset from ._experiments import Experiment from ._experiments import ExperimentResults +from ._llmobs import LLMObs __all__ = ["LLMObs", "Dataset", "Experiment", "ExperimentResults"] diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 156fab1d8cc..4b4a3f4cdb1 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,13 +1,19 @@ +import concurrent.futures from datetime import datetime -from http.client import HTTPSConnection import hashlib +from http.client import HTTPSConnection import json import os -from typing import Any, Callable, Dict, List, Union, Optional, Iterator import sys import time +from typing import Any +from typing import Callable +from typing import Dict +from typing import Iterator +from typing import List +from typing import Optional +from typing import Union from urllib.parse import quote -import concurrent.futures import uuid From ac634fa333dfe4da51846db9560593326653af52 Mon Sep 17 00:00:00 2001 From: Kyle Verhoog Date: Tue, 5 Nov 2024 17:05:46 -0500 Subject: [PATCH 14/36] Simplify http client, remove stdout printing --- ddtrace/llmobs/_experiments.py | 749 +++++------------- ddtrace/llmobs/_utils.py | 31 + .../appsec/iast/fixtures/propagation_path.py | 3 +- .../experiments/test_dataset_pull.yaml | 136 ++++ .../experiments/test_dataset_pull_dne.yaml | 38 + tests/llmobs/test_llmobs_experiments.py | 40 +- tests/llmobs/test_utils.py | 10 + 7 files changed, 472 insertions(+), 535 deletions(-) create mode 100644 tests/llmobs/cassettes/experiments/test_dataset_pull.yaml create mode 100644 
tests/llmobs/cassettes/experiments/test_dataset_pull_dne.yaml diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 4b4a3f4cdb1..f1998f1a8cb 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,7 +1,5 @@ import concurrent.futures from datetime import datetime -import hashlib -from http.client import HTTPSConnection import json import os import sys @@ -12,13 +10,14 @@ from typing import Iterator from typing import List from typing import Optional -from typing import Union from urllib.parse import quote import uuid +from ._utils import HTTPResponse +from ._utils import http_request -# Constants -BASE_URL = "api.datadoghq.com" + +BASE_URL = "https://api.datadoghq.com" class Dataset: @@ -29,19 +28,17 @@ class Dataset: Attributes: name (str): Name of the dataset - data (List[Dict[str, Any]]): List of records containing input/output pairs description (str): Optional description of the dataset - datadog_dataset_id (str): ID assigned by Datadog after pushing (None if not pushed) """ def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") -> None: self.name = name + self.description = description self._validate_data(data) self._data = data - self.description = description # Post-push attributes - self.datadog_dataset_id = None + self._datadog_dataset_id = None def __iter__(self) -> Iterator[Dict[str, Any]]: return iter(self._data) @@ -52,48 +49,6 @@ def __len__(self) -> int: def __getitem__(self, index: int) -> Dict[str, Any]: return self._data[index] - def __repr__(self) -> str: - header = f"Dataset: {self.name}\nDescription: {self.description}\nLength: {len(self)}\nDatadog ID: {self.datadog_dataset_id}\n" - separator = f"+{'-' * 10}+{'-' * 38}+{'-' * 38}+" - - def format_dict(d: Dict[str, Any]) -> List[str]: - def truncate(value: str) -> str: - return (value[:17] + "...") if len(value) > 20 else value - - return [f"{key}: {truncate(str(value))}" for key, value in d.items()] - - def format_entries(entries): - formatted_rows = [] - for i, entry in entries: - input_lines = format_dict(entry["input"]) - expected_output_lines = format_dict(entry.get("expected_output", {})) - - # Determine the maximum number of lines in input and expected_output - max_lines = max(len(input_lines), len(expected_output_lines)) - - # Pad the lists to have the same number of lines - input_lines += [""] * (max_lines - len(input_lines)) - expected_output_lines += [""] * (max_lines - len(expected_output_lines)) - - for j in range(max_lines): - if j == 0: - index = f"| {i+1:<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} |" - else: - index = f"| {'':<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} |" - formatted_rows.append(index) - formatted_rows.append(separator) - return "\n".join(formatted_rows) - - if len(self._data) <= 4: - entries = format_entries(enumerate(self._data)) - else: - first_two = format_entries(enumerate(self._data[:2])) - last_two = format_entries(enumerate(self._data[-2:], start=len(self._data) - 2)) - entries = f"{first_two}\n| {'...':<8} | {'...':<38} | {'...':<38} |\n{separator}\n{last_two}" - - table = f"{separator}\n| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} |\n{separator}\n{entries}" - return f"{header}\n{table if entries else 'No entries available.'}\n\n" - def _validate_data(self, data: List[Dict[str, Any]]) -> None: """Validate the format and structure of dataset records. 
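For orientation, a minimal sketch of how the slimmed-down Dataset container is exercised after this hunk — construction, len(), indexing, and iteration all operate on the plain record dicts. The rows below are illustrative placeholders, not data taken from this patch:

    from ddtrace.llmobs import Dataset

    # Placeholder rows: flat "input"/"expected_output" dicts, mirroring the tests further down.
    records = [
        {"input": {"prompt": "capital of Italy?"}, "expected_output": {"response": "Rome"}},
        {"input": {"prompt": "capital of Spain?"}, "expected_output": {"response": "Madrid"}},
    ]

    dataset = Dataset(name="capitals-demo", data=records, description="toy dataset")

    assert len(dataset) == 2                                        # __len__
    assert dataset[1]["expected_output"] == {"response": "Madrid"}  # __getitem__
    for record in dataset:                                          # __iter__ yields the raw dicts
        print(record["input"]["prompt"])
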
@@ -138,49 +93,39 @@ def from_datadog(cls, name: str) -> "Dataset": ValueError: If the dataset is not found Exception: If there are HTTP errors during the request """ - _validate_api_keys() - conn = HTTPSConnection(BASE_URL) - headers = { - "DD-API-KEY": os.getenv("DD_API_KEY"), - "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), - "Content-Type": "application/json", - } + # Get dataset ID + encoded_name = quote(name) + url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" + resp = exp_http_request("GET", url) + response_data = resp.json() + datasets = response_data.get("data", []) + + if not datasets: + raise ValueError(f"Dataset '{name}' not found") + + dataset_id = datasets[0]["id"] + + # Get dataset records + url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" + resp = exp_http_request("GET", url) + records_data = resp.json() + + # Transform records into the expected format + class_records = [] + for record in records_data.get("data", []): + attrs = record.get("attributes", {}) + class_records.append( + { + "input": attrs.get("input", {}), + "expected_output": attrs.get("expected_output", {}), + **attrs.get("metadata", {}), + } + ) - try: - # Get dataset ID - encoded_name = quote(name) - url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" - response_data = _make_request(conn, headers, "GET", url, context="Dataset lookup") - datasets = response_data.get("data", []) - - if not datasets: - raise ValueError(f"Dataset '{name}' not found") - - dataset_id = datasets[0]["id"] - - # Get dataset records - url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" - records_data = _make_request(conn, headers, "GET", url, context="Records lookup") - - # Transform records into the expected format - class_records = [] - for record in records_data.get("data", []): - attrs = record.get("attributes", {}) - class_records.append( - { - "input": attrs.get("input", {}), - "expected_output": attrs.get("expected_output", {}), - **attrs.get("metadata", {}), - } - ) - - # Create new dataset instance - dataset = cls(name, class_records) - dataset.datadog_dataset_id = dataset_id - return dataset - - finally: - conn.close() + # Create new dataset instance + dataset = cls(name, class_records) + dataset._datadog_dataset_id = dataset_id + return dataset def push(self) -> Dict[str, str]: """Push the dataset to Datadog. 
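A hedged sketch of the pull path rewritten in the hunk above, assuming DD_API_KEY and DD_APPLICATION_KEY are already exported (exp_http_request() refuses to run without them). The dataset name mirrors the cassette used by test_dataset_pull further down; substitute any dataset that exists in your org:

    from ddtrace.llmobs import Dataset

    try:
        # from_datadog() looks the dataset up by name and raises ValueError when no match exists.
        dataset = Dataset.from_datadog("meal-calorie-dataset-multilingual-3")
    except ValueError:
        dataset = None  # behaves like the test_dataset_pull_dne case

    if dataset is not None:
        # Records come back flattened as {"input": ..., "expected_output": ..., **metadata}.
        print(len(dataset), dataset[0]["input"], dataset[0]["expected_output"])
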
@@ -191,70 +136,44 @@ def push(self) -> Dict[str, str]: - dataset_name: The name of the dataset - record_count: Number of records uploaded """ - _validate_api_keys() - conn = HTTPSConnection(BASE_URL) - headers = { - "DD-API-KEY": os.getenv("DD_API_KEY"), - "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), - "Content-Type": "application/json", - } - - try: - # Check if dataset exists - encoded_name = quote(self.name) - url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" - response_data = _make_request(conn, headers, "GET", url, context="Dataset lookup") - datasets = response_data.get("data", []) - - if not datasets: - # Create new dataset - dataset_payload = { - "data": { - "type": "datasets", - "attributes": { - "name": self.name, - "description": self.description, - "metadata": {"team": "ml-obs"}, - }, - } + # Check if dataset exists + encoded_name = quote(self.name) + url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" + resp = exp_http_request("GET", url) + response_data = resp.json() + datasets = response_data.get("data", []) + + if not datasets: + # Create new dataset + dataset_payload = { + "data": { + "type": "datasets", + "attributes": { + "name": self.name, + "description": self.description, + "metadata": {"team": "ml-obs"}, + }, } - response_data = _make_request( - conn, - headers, - "POST", - "/api/unstable/llm-obs/v1/datasets", - body=json.dumps(dataset_payload), - context="Dataset creation", - ) - dataset_id = response_data["data"]["id"] - self.datadog_dataset_id = dataset_id - else: - # Dataset exists, raise error - raise ValueError( - f"Dataset '{self.name}' already exists. Dataset versioning will be supported in a future release. " - "Please use a different name for your dataset." - ) - - # Add records to the dataset - records_payload = {"data": {"type": "datasets", "attributes": {"records": self._data}}} - url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" - _make_request( - conn, - headers, - "POST", - url, - body=json.dumps(records_payload), - context="Adding records", + } + resp = exp_http_request( + "POST", "/api/unstable/llm-obs/v1/datasets", body=json.dumps(dataset_payload).encode("utf-8") + ) + response_data = resp.json() + dataset_id = response_data["data"]["id"] + self._datadog_dataset_id = dataset_id + else: + # Dataset exists, raise error + raise ValueError( + f"Dataset '{self.name}' already exists. Dataset versioning will be supported in a future release. " + "Please use a different name for your dataset." 
) - print(f"✓ Successfully uploaded dataset '{self.name}'") - print(f" • Dataset ID: {dataset_id}") - print(f" • Records uploaded: {len(self._data)}") - - return self - - finally: - conn.close() + # Add records to the dataset + records_payload = {"data": {"type": "datasets", "attributes": {"records": self._data}}} + url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" + resp = exp_http_request("POST", url, body=json.dumps(records_payload).encode("utf-8")) + data = resp.json() + return data class Experiment: @@ -301,26 +220,6 @@ def __init__( self.outputs = [] self.results = None - def __repr__(self) -> str: - separator = f"+{'-' * 20}+{'-' * 50}+" - - def format_evaluator(evaluator: Callable) -> str: - return f"{evaluator.__name__}" - - evaluator_lines = [format_evaluator(evaluator) for evaluator in self.evaluators] - evaluators = ", ".join(evaluator_lines) if evaluator_lines else "No evaluators available" - - table = ( - f"{separator}\n" - f"| {'Experiment':<18} | {self.name:<48} |\n" - f"{separator}\n" - f"| {'Task':<18} | {self.task.__name__:<48} |\n" - f"| {'Dataset':<18} | {f'{self.dataset.name} (n={len(self.dataset)})':<48} |\n" - f"| {'Evaluators':<18} | {evaluators:<48} |\n" - f"{separator}" - ) - return table - def _validate_tasks(self) -> None: # TODO: Design and implement this pass @@ -405,18 +304,8 @@ def process_row(idx_row): idx = future_to_idx[future] outputs_buffer[idx] = future.result() completed += 1 - - # Update progress - progress = int(50 * completed / total_rows) - bar = f"{'=' * progress}{' ' * (50 - progress)}" - percent = int(100 * completed / total_rows) - sys.stdout.write(f"\rRunning {self.name}: [{bar}] {percent}% ({completed}/{total_rows})") - sys.stdout.flush() - self.outputs = outputs_buffer - sys.stdout.write("\n") - self.has_run = True return self.eval() @@ -527,69 +416,6 @@ def __init__(self, dataset: Dataset, experiment: Experiment) -> None: self.experiment = experiment self.experiment_rows = [] - def __repr__(self) -> str: - separator = f"+{'-' * 10}+{'-' * 38}+{'-' * 38}+{'-' * 38}+{'-' * 38}+" - - def format_dict(d: Union[Dict[str, Any], List[Any]]) -> List[str]: - if isinstance(d, dict): - - def truncate(value: str) -> str: - return (value[:17] + "...") if len(value) > 20 else value - - return [f"{key}: {truncate(str(value))}" for key, value in d.items()] - elif isinstance(d, list): - return [str(item) for item in d] - else: - return [str(d)] - - def format_entries(entries): - formatted_rows = [] - for i, entry in enumerate(entries): - dataset_idx = entry["metadata"]["dataset_record_idx"] - dataset_entry = self.dataset[dataset_idx] - input_lines = format_dict(dataset_entry["input"]) - expected_output_lines = format_dict(dataset_entry.get("expected_output", {})) - output_lines = format_dict(entry["output"]) - evaluations_lines = format_dict(entry.get("evaluations", [])) - - # Determine the maximum number of lines across all fields - max_lines = max( - len(input_lines), - len(expected_output_lines), - len(output_lines), - len(evaluations_lines), - ) - - # Pad the lists to have the same number of lines - input_lines += [""] * (max_lines - len(input_lines)) - expected_output_lines += [""] * (max_lines - len(expected_output_lines)) - output_lines += [""] * (max_lines - len(output_lines)) - evaluations_lines += [""] * (max_lines - len(evaluations_lines)) - - for j in range(max_lines): - if j == 0: - index = f"| {dataset_idx:<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" - else: 
- index = f"|{'':<8} | {input_lines[j]:<38} | {expected_output_lines[j]:<38} | {output_lines[j]:<38} | {evaluations_lines[j]:<38} |" - formatted_rows.append(index) - formatted_rows.append(separator) - return "\n".join(formatted_rows) - - if len(self.experiment_rows) <= 4: - entries = format_entries(self.experiment_rows) - else: - first_two = format_entries(self.experiment_rows[:2]) - last_two = format_entries(self.experiment_rows[-2:]) - entries = f"{first_two}\n| {'...':<8} | {'...':<38} | {'...':<38} | {'...':<38} | {'...':<38} |\n{separator}\n{last_two}" - - table = ( - f"{separator}\n" - f"| {'Index':<8} | {'Input':<38} | {'Expected Output':<38} | {'Output':<38} | {'Evaluations':<38} |\n" - f"{separator}\n" - f"{entries}" - ) - return f"Experiment Results:\n{table if entries else 'No results available.'}\n\n" - def __iter__(self) -> Iterator[Dict[str, Any]]: return iter(self.experiment_rows) @@ -608,234 +434,155 @@ def push(self) -> Dict[str, str]: - experiment_name: The name of the experiment - span_count: Number of spans uploaded """ - _validate_api_keys() - - # Initialize connection and headers - conn = HTTPSConnection(BASE_URL) - headers = { - "DD-API-KEY": os.getenv("DD_API_KEY"), - "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), - "Content-Type": "application/json", - } - - try: - # Check if project exists - url = f"/api/unstable/llm-obs/v1/projects?filter[name]={self.experiment.project_name}" - response_data = _make_request(conn, headers, "GET", url, context="Project lookup") - projects = response_data.get("data", []) - - if not projects: - # Create new project - print(f"Project '{self.experiment.project_name}' not found. Creating it.") - project_payload = { - "data": { - "type": "projects", - "attributes": { - "name": self.experiment.project_name, - "description": "", - "metadata": {"team": "ml-obs"}, - }, - } - } - response_data = _make_request( - conn, - headers, - "POST", - "/api/unstable/llm-obs/v1/projects", - body=json.dumps(project_payload), - context="Project creation", - ) - project_id = response_data["data"]["id"] - else: - project_id = projects[0]["id"] - - # Check if experiment exists - encoded_name = quote(self.experiment.name) - url = f"/api/unstable/llm-obs/v1/experiments?filter[name]={encoded_name}" - response_data = _make_request(conn, headers, "GET", url, context="Experiment lookup") - experiments = response_data.get("data", []) - - if not experiments: - # Create new experiment - experiment_payload = { - "data": { - "type": "experiments", - "attributes": { - "name": self.experiment.name, - "description": self.experiment.description, - "dataset_id": self.experiment.dataset.datadog_dataset_id, - "project_id": project_id, - "metadata": { - "tags": self.experiment.tags, - **self.experiment.metadata, - }, - }, - } - } - response_data = _make_request( - conn, - headers, - "POST", - "/api/unstable/llm-obs/v1/experiments", - body=json.dumps(experiment_payload), - context="Experiment creation", - ) - experiment_id = response_data["data"]["id"] - else: - # Experiment exists, create a new version - version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - new_experiment_name = f"{self.experiment.name}-{version_suffix}" - print(f"Experiment '{self.experiment.name}' found. 
Creating new version '{new_experiment_name}'.") - experiment_payload = { - "data": { - "type": "experiments", - "attributes": { - "name": new_experiment_name, - "description": self.experiment.description, - "dataset_id": self.experiment.dataset.datadog_dataset_id, - "project_id": project_id, - "metadata": { - "tags": self.experiment.tags, - **self.experiment.metadata, - }, - }, - } + # Check if project exists + url = f"/api/unstable/llm-obs/v1/projects?filter[name]={self.experiment.project_name}" + resp = exp_http_request("GET", url) + response_data = resp.json() + projects = response_data.get("data", []) + if not projects: + # Create new project + project_payload = { + "data": { + "type": "projects", + "attributes": { + "name": self.experiment.project_name, + "description": "", + "metadata": {"team": "ml-obs"}, + }, } - response_data = _make_request( - conn, - headers, - "POST", - "/api/unstable/llm-obs/v1/experiments", - body=json.dumps(experiment_payload), - context="Experiment version creation", - ) - experiment_id = response_data["data"]["id"] - self.experiment.name = new_experiment_name - - spans = [] - metrics = [] - - for idx, result in enumerate(self.experiment_rows): - span = { - "span_id": _make_id(), - "project_id": project_id, - "experiment_id": experiment_id, - "dataset_id": self.experiment.dataset.datadog_dataset_id, - "dataset_record_id": _make_id(), - "start_ns": int(result["metadata"]["timestamp"] * 1e9), - "duration": float(result["metadata"]["duration"] * 1e9), - "tags": self.experiment.tags, - "status": "ok", - "metrics": {}, # TODO: Fill in with actual metrics once we have tracing and llm spans - "meta": { - "span": {"kind": "experiment"}, - "input": self.experiment.dataset[idx]["input"], - "output": result["output"], - "expected_output": self.experiment.dataset[idx].get("expected_output", {}), - "error": { - "message": result["error"], - "stack": None, - "type": None, + } + resp = exp_http_request( + "POST", + "/api/unstable/llm-obs/v1/projects", + body=json.dumps(project_payload).encode("utf-8"), + ) + response_data = resp.json() + project_id = response_data["data"]["id"] + else: + project_id = projects[0]["id"] + + # Check if experiment exists + encoded_name = quote(self.experiment.name) + url = f"/api/unstable/llm-obs/v1/experiments?filter[name]={encoded_name}" + resp = exp_http_request("GET", url) + response_data = resp.json() + experiments = response_data.get("data", []) + + if not experiments: + # Create new experiment + experiment_payload = { + "data": { + "type": "experiments", + "attributes": { + "name": self.experiment.name, + "description": self.experiment.description, + "dataset_id": self.experiment.dataset._datadog_dataset_id, + "project_id": project_id, + "metadata": { + "tags": self.experiment.tags, + **self.experiment.metadata, }, }, } - spans.append(span) - - # Add evaluation metrics - for metric_name, metric_value in result["evaluations"].items(): - timestamp_ms = int(result["metadata"]["timestamp"] * 1000) - - # Check for bool first, since bool is a subclass of int - if isinstance(metric_value, bool): - metric_type = "categorical" - metric_value = str(metric_value).lower() - elif isinstance(metric_value, (int, float)): - metric_type = "score" - else: - metric_type = "categorical" - metric_value = str(metric_value) - - metric = { - "span_id": span["span_id"], - "metric_type": metric_type, - "timestamp_ms": timestamp_ms, - "label": metric_name, - "score_value" if metric_type == "score" else "categorical_value": metric_value, - } - - if metric_type == 
"score": - metric["score_value"] = metric_value - else: - metric["categorical_value"] = metric_value - - metrics.append(metric) - - print(metrics) - results_payload = { + } + resp = exp_http_request( + "POST", "/api/unstable/llm-obs/v1/experiments", body=json.dumps(experiment_payload).encode("utf-8") + ) + response_data = resp.json() + experiment_id = response_data["data"]["id"] + else: + # Experiment exists, create a new version + version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + new_experiment_name = f"{self.experiment.name}-{version_suffix}" + experiment_payload = { "data": { "type": "experiments", - "attributes": {"spans": spans, "metrics": metrics}, + "attributes": { + "name": new_experiment_name, + "description": self.experiment.description, + "dataset_id": self.experiment.dataset._datadog_dataset_id, + "project_id": project_id, + "metadata": { + "tags": self.experiment.tags, + **self.experiment.metadata, + }, + }, } } - - url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" - _make_request( - conn, - headers, - "POST", - url, - body=json.dumps(results_payload), - context="Publishing results", + resp = exp_http_request( + "POST", "/api/unstable/llm-obs/v1/experiments", body=json.dumps(experiment_payload).encode("utf-8") ) + response_data = resp.json() + experiment_id = response_data["data"]["id"] + self.experiment.name = new_experiment_name + + spans = [] + metrics = [] + for idx, result in enumerate(self.experiment_rows): + span = { + "span_id": _make_id(), + "project_id": project_id, + "experiment_id": experiment_id, + "dataset_id": self.experiment.dataset._datadog_dataset_id, + "dataset_record_id": _make_id(), + "start_ns": int(result["metadata"]["timestamp"] * 1e9), + "duration": float(result["metadata"]["duration"] * 1e9), + "tags": self.experiment.tags, + "status": "ok", + "metrics": {}, # TODO: Fill in with actual metrics once we have tracing and llm spans + "meta": { + "span": {"kind": "experiment"}, + "input": self.experiment.dataset[idx]["input"], + "output": result["output"], + "expected_output": self.experiment.dataset[idx].get("expected_output", {}), + "error": { + "message": result["error"], + "stack": None, + "type": None, + }, + }, + } + spans.append(span) + + # Add evaluation metrics + for metric_name, metric_value in result["evaluations"].items(): + timestamp_ms = int(result["metadata"]["timestamp"] * 1000) + + # Check for bool first, since bool is a subclass of int + if isinstance(metric_value, bool): + metric_type = "categorical" + metric_value = str(metric_value).lower() + elif isinstance(metric_value, (int, float)): + metric_type = "score" + else: + metric_type = "categorical" + metric_value = str(metric_value) + + metric = { + "span_id": span["span_id"], + "metric_type": metric_type, + "timestamp_ms": timestamp_ms, + "label": metric_name, + "score_value" if metric_type == "score" else "categorical_value": metric_value, + } - print(f"✓ Successfully uploaded experiment results for '{self.experiment.name}'") - print(f" • Experiment ID: {experiment_id}") - print(f" • Spans uploaded: {len(spans)}") - print(f" • Metrics uploaded: {len(metrics)}") - - return self - - finally: - conn.close() - - -def _make_request( - conn: HTTPSConnection, - headers: Dict[str, Any], - method: str, - url: str, - body: Optional[Any] = None, - context: str = "", -) -> Dict[str, Any]: - """Make an HTTP request to the Datadog API. 
- - Raises: - DatadogAPIError: If the request fails or returns an error status - DatadogResponseError: If the response contains invalid JSON - """ - if method == "GET": - conn.request(method, url, headers=headers) - else: - if body is not None and isinstance(body, str): - body = body.encode("utf-8") - conn.request(method, url, body=body, headers=headers) - - response = conn.getresponse() - response_body = response.read() - response_text = response_body.decode("utf-8") + if metric_type == "score": + metric["score_value"] = metric_value + else: + metric["categorical_value"] = metric_value - if response.status >= 400: - error_message = f"HTTP {response.status} Error during {context}: {response.reason}\nResponse: {response_text}" - raise DatadogAPIError(error_message, status_code=response.status, response=response_text) + metrics.append(metric) - if not response_body: - return {} + results_payload = { + "data": { + "type": "experiments", + "attributes": {"spans": spans, "metrics": metrics}, + } + } - try: - return json.loads(response_body) - except json.JSONDecodeError: - error_message = f"Invalid JSON response during {context}. Status: {response.status}" - raise DatadogResponseError(error_message, raw_response=response_text) + url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" + exp_http_request("POST", url, body=json.dumps(results_payload).encode("utf-8")) + return self def _make_id() -> str: @@ -847,29 +594,8 @@ def _make_id() -> str: return uuid.uuid4().hex -class DatadogAPIError(Exception): - """Raised when there is an error interacting with the Datadog API.""" - - def __init__(self, message: str, status_code: Optional[int] = None, response: Optional[str] = None): - self.status_code = status_code - self.response = response - super().__init__(message) - - -class DatadogResponseError(Exception): - """Raised when there is an error parsing the response from Datadog.""" - - def __init__(self, message: str, raw_response: Optional[str] = None): - self.raw_response = raw_response - super().__init__(message) - - -def _validate_api_keys() -> None: - """Validate that required Datadog API keys are set in environment variables. - - Raises: - ValueError: If any required API keys are missing from environment variables - """ +def exp_http_request(method: str, url: str, body: Optional[bytes] = None) -> HTTPResponse: + """Make an HTTP request to the Datadog experiments API.""" missing_keys = [] for key in ["DD_API_KEY", "DD_APPLICATION_KEY"]: if not os.getenv(key): @@ -881,49 +607,10 @@ def _validate_api_keys() -> None: "Please set these environment variables before pushing to Datadog." ) - -class Prompt: - """A class for rendering templated prompts with variables. - - Supports both simple string templates and structured chat-like templates. - - Attributes: - template (Union[str, List[Dict[str, str]]]): Either a template string or a list of message dictionaries - variables (dict): Default variables to use when rendering the template - """ - - def __init__(self, template, variables=None): - """Initialize a new Prompt. - - Args: - template (Union[str, List[Dict[str, str]]]): Either a template string or a list of message dictionaries - variables (dict, optional): Default variables to use when rendering the template. Defaults to {}. - """ - self.template = template - self.variables = variables or {} - - def render(self, **kwargs): - """Render the template with provided variables. - - Args: - **kwargs: Additional variables to use when rendering the template. 
- These override any default variables with the same name. - - Returns: - Union[str, List[Dict[str, str]]]: The rendered template with all variables substituted - """ - merged_vars = {**self.variables, **kwargs} - - if isinstance(self.template, str): - return self.template.format(**merged_vars) - elif isinstance(self.template, (list, tuple)): - return [ - {k: v.format(**merged_vars) if isinstance(v, str) else v for k, v in message.items()} - for message in self.template - ] - else: - raise ValueError("Template must be either a string or a list of message dictionaries") - - def __repr__(self): - hash = hashlib.md5(str(self.template).encode()).hexdigest()[:8] - return f"Prompt(hash={hash})" + headers = { + "DD-API-KEY": os.getenv("DD_API_KEY"), + "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), + "Content-Type": "application/json", + } + url = BASE_URL + url + return http_request(method, url, headers=headers, body=body) diff --git a/ddtrace/llmobs/_utils.py b/ddtrace/llmobs/_utils.py index 7dd17ea94f3..667dcdd7fd5 100644 --- a/ddtrace/llmobs/_utils.py +++ b/ddtrace/llmobs/_utils.py @@ -1,7 +1,9 @@ +import http.client import json from typing import Dict from typing import Optional from typing import Union +import urllib.request import ddtrace from ddtrace import Span @@ -163,3 +165,32 @@ def safe_json(obj): return json.dumps(obj, skipkeys=True, default=_unserializable_default_repr) except Exception: log.error("Failed to serialize object to JSON.", exc_info=True) + + +class HTTPResponse: + def __init__(self, resp: http.client.HTTPResponse) -> None: + self._resp = resp + + @property + def status_code(self) -> int: + return self._resp.status + + def json(self) -> dict: + """Return the JSON content of the response. + + Note that this method can only be called once as the response content is read and consumed. 
+ """ + data = self._resp.read() + print(data) + return json.loads(data.decode("utf-8")) + + +def http_request( + method: str, url: str, headers: Optional[Dict[str, str]] = None, body: Optional[bytes] = None +) -> HTTPResponse: + # Create the request object + req = urllib.request.Request(url, data=body, method=method) + if headers: + for key, value in headers.items(): + req.add_header(key, value) + return HTTPResponse(urllib.request.urlopen(req)) diff --git a/tests/appsec/iast/fixtures/propagation_path.py b/tests/appsec/iast/fixtures/propagation_path.py index 7dcaa737995..d645e781e3f 100644 --- a/tests/appsec/iast/fixtures/propagation_path.py +++ b/tests/appsec/iast/fixtures/propagation_path.py @@ -2,13 +2,12 @@ CAVEAT: the line number is important to some IAST tests, be careful to modify this file and update the tests if you make some changes """ +import _io import asyncio import os import re import sys -import _io - ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) diff --git a/tests/llmobs/cassettes/experiments/test_dataset_pull.yaml b/tests/llmobs/cassettes/experiments/test_dataset_pull.yaml new file mode 100644 index 00000000000..3c23f1d0eb0 --- /dev/null +++ b/tests/llmobs/cassettes/experiments/test_dataset_pull.yaml @@ -0,0 +1,136 @@ +interactions: +- request: + body: null + headers: + Connection: + - close + Content-Type: + - application/json + Host: + - api.datadoghq.com + User-Agent: + - Python-urllib/3.12 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets?filter[name]=meal-calorie-dataset-multilingual-3 + response: + body: + string: '{"data":[{"id":"f61953f8-43de-4a99-bcaf-0b145471dcb0","type":"datasets","attributes":{"author":{"id":"8855edca-05df-11ec-bea0-da7ad0900002"},"created_at":"2024-11-05T15:57:29.145601Z","description":"A + dataset of meals and their expected calories","metadata":{"team":"ml-obs"},"name":"meal-calorie-dataset-multilingual-3","updated_at":"2024-11-05T15:57:29.222045Z"}}]}' + headers: + Connection: + - close + Content-Length: + - '367' + Content-Type: + - application/vnd.api+json + Date: + - Tue, 05 Nov 2024 21:46:56 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +- request: + body: null + headers: + Connection: + - close + Content-Type: + - application/json + Host: + - api.datadoghq.com + User-Agent: + - Python-urllib/3.12 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/f61953f8-43de-4a99-bcaf-0b145471dcb0/records + response: + body: + string: "{\"data\":[{\"id\":\"92ada783-c6c7-42c3-97c7-a64a2341aadd\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":1100},\"input\":{\"user_input\":\"had + a big mac with medium fries and a coke for lunch\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"450af99c-c56e-4b81-a466-eb1ec69b7ad4\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":416},\"input\":{\"user_input\":\"breakfast: + 2 eggs, toast with butter, and black 
coffee\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"7cd8db38-3532-4fbc-954c-1407814b79e7\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":535},\"input\":{\"user_input\":\"grilled + chicken breast with rice and steamed broccoli\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"0db67626-ecb4-4706-b395-79e4f3e4f635\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":400},\"input\":{\"user_input\":\"turkey + sandwich on wheat with lettuce and mayo\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"10adbc55-f928-4a54-b6d7-0ef24a9c4436\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":325},\"input\":{\"user_input\":\"bowl + of cheerios with 2% milk and a banana\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"1c930cda-8327-4058-94eb-c52a6926472a\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":810},\"input\":{\"user_input\":\"6-inch + subway turkey sub with chips and cookie\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"f544f506-907c-4ecb-a3da-f61a98000cec\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":720},\"input\":{\"user_input\":\"chipotle + bowl with chicken, rice, beans, and guac\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"94dd5c6c-5272-4aeb-89f4-46726c5c9a70\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":640},\"input\":{\"user_input\":\"salmon + fillet with quinoa and asparagus\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"c7171013-ef82-4a73-811d-70ab294bebb3\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":520},\"input\":{\"user_input\":\"chicken + caesar salad with croutons\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"04cdfe94-3b3c-4047-9532-99db98e02dd8\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":495},\"input\":{\"user_input\":\"peanut + butter and jelly sandwich with 
apple\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"63cd9fe2-9ed3-4ac2-99a0-3953c54a0248\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":2800},\"input\":{\"user_input\":\"omg + just demolished a whole pizza by myself \U0001F355\U0001F60B pepperoni + extra + cheese\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"b66641b9-e5fe-402a-bfe0-161243d68ac3\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":535},\"input\":{\"user_input\":\"post-workout + protein shake w banana n pb \U0001F4AA\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"12266e11-d4a7-4349-b144-d7db9fee9bbf\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":486},\"input\":{\"user_input\":\"brunch + goals! \U0001F60D avocado toast + poached eggs + mimosa #sundayfunday\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"365d6a99-5a87-496a-9a4d-1a5c1e942c86\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":1000},\"input\":{\"user_input\":\"living + my best life with this massive burrito \U0001F32F\u2728\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"f3f1f98a-1e87-4f09-9193-ec48580e21cd\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":420},\"input\":{\"user_input\":\"friday + night ice cream run! \U0001F366 got a waffle cone w/ 2 scoops\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"4067adbd-c3ca-4220-bee8-7c767f8373a4\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":327},\"input\":{\"user_input\":\"meal + prep sunday done right! 
\U0001F957 chicken + sweet potato + kale\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"5f332a5f-1805-4ca3-b4dc-edd664b9dcd0\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":1400},\"input\":{\"user_input\":\"cant + believe i ate this whole bag of doritos \U0001F631 #noregrets\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"59d278f5-2557-4b45-8900-e15b9cb50654\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":450},\"input\":{\"user_input\":\"smoothie + bowl szn \U0001F353 topped w granola and coconut #healthy\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"b75f753e-7629-4156-b5d7-52567d690c3d\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":925},\"input\":{\"user_input\":\"date + night vibes \U0001F35D pasta carbonara + garlic bread + wine\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"8f39141a-c9a0-4089-8ddf-9913f652d636\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":400},\"input\":{\"user_input\":\"birthday + cake for breakfast because yolo \U0001F382 #treatyoself\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"03a99f81-05f2-4708-9ee2-ce3432fdf4d9\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":550},\"input\":{\"user_input\":\"sushiz + 4 lunch - spicy tuna n californa role\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"14135673-a65e-4680-ad67-a90402cc25e4\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":416},\"input\":{\"user_input\":\"brekkie + - eggs n tosst w/ coffey\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"4feaaad7-560c-4fa6-b432-b02644b2d823\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":800},\"input\":{\"user_input\":\"choclate + milkshak n french friez\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"a57c9b1f-b633-4044-a9ca-d7b39015738a\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":400},\"input\":{\"user_input\":\"grilld + cheez n tomato 
soop\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"48abb37c-cd6c-411b-8727-1e6255ea0c69\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":1200},\"input\":{\"user_input\":\"chikn + alfredo pasta w/ garlic bred\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"f7f02f81-8fbe-4b3d-8520-c101f88a3837\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":325},\"input\":{\"user_input\":\"cerel + w/ banan n milk 4 brekfast\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"715d71e5-c0ca-466d-825b-9887e64d7dc9\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":800},\"input\":{\"user_input\":\"intermittent + fasting break: huge bowl of pasta #carbload\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"16169dc7-42cd-42ef-a9fe-1a09c4cda987\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":300},\"input\":{\"user_input\":\"clean + eating day 1: grilled fish and steamed vegetables\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"a033fdb7-fea6-47cb-9078-884efa832210\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":950},\"input\":{\"user_input\":\"foodie + adventures: trying this amazing wagyu burger \U0001F60D\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"9129656a-fb11-4980-9d27-ec51e6dbc485\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":480},\"input\":{\"user_input\":\"midnight + munchies: instant ramen with egg \U0001F35C\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"67f163c5-8fb7-43ec-aeee-d3ef0807e6ef\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":600},\"input\":{\"user_input\":\"almuerzo: + 2 tacos de pollo con guacamole y arroz\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"96a4841f-f929-4e00-9a6a-d88b66effa7e\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"had + some ph\u1EDF with extra brisket for 
dinner\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"90717f0d-df1c-4dc0-a547-a4288790a435\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":950},\"input\":{\"user_input\":\"butter + chicken with naan and rice \U0001F1EE\U0001F1F3\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"3d39ee90-38c1-4b96-ac43-01f0e488f87b\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"authentic + pad thai with shrimp #thaifood\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"fb53483f-987e-4798-a02f-d4dde2105a34\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":600},\"input\":{\"user_input\":\"homemade + sushi rolls - california y spicy tuna\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"fcd71b82-e4cb-4120-858a-bfb435a950d6\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":850},\"input\":{\"user_input\":\"cena: + pasta alla carbonara con pancetta\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"c299c62e-abb4-4d4d-8ad8-3e90d0c7cecd\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":750},\"input\":{\"user_input\":\"dim + sum brunch: siu mai, har gow, and char siu bao\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"d7ac551e-09ad-4a4b-b444-881c9eabd925\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":850},\"input\":{\"user_input\":\"kebab + plate with hummus and tabouleh\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"d15fc64e-a3cb-46c1-905b-25660db665bf\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"bibimbap + with extra gochujang \U0001F1F0\U0001F1F7\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"a77f58e0-6a33-462d-8cd3-a5f4c45c5749\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":900},\"input\":{\"user_input\":\"enchiladas + verdes con pollo y 
frijoles\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"0bfdd8fd-7c32-4c45-a860-70c28d243b60\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"\u4ECA\u5929\u4E2D\u5348\u5403\u4E86\u53C9\u70E7\u996D\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"b98c669c-246e-47e8-a671-5aab9682bbb6\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":850},\"input\":{\"user_input\":\"almo\xE7o + hoje foi uma lasanha bem gorda\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"aae1f1a0-ab23-49fa-91b0-2792be8917cf\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"\u0623\u0643\u0644\u062A + \u0627\u0644\u063A\u062F\u0627\u0621 \u0627\u0644\u064A\u0648\u0645 \u0633\u0645\u0643\u0629 + \u0645\u0634\u0648\u064A\u0629 \u0628\u0627\u0644\u0644\u062D\u0645\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"999b9c93-91e5-4d7f-b31f-83d3f886b850\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":500},\"input\":{\"user_input\":\"\u0431\u043E\u0440\u0449 + \u0441 \u0447\u0435\u0441\u043D\u043E\u0447\u043D\u044B\u043C\u0438 \u043F\u0430\u043C\u043F\u0443\u0448\u043A\u0430\u043C\u0438 + \u043D\u0430 \u043E\u0431\u0435\u0434 \"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"5c15fe8f-cbb6-462e-91ed-f19a7c6fc51b\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":600},\"input\":{\"user_input\":\"\u9EBB\u5A46\u8C46\u8150\u914D\u7C73\u996D\uFF0C\u5F88\u8FA3\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"99a89c03-10ea-4627-be2a-c3c7bf4a0cb8\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":900},\"input\":{\"user_input\":\"feijoada + completa com farofa e couve\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"fdaec01d-ac8a-491f-b756-67cca9373a5f\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":850},\"input\":{\"user_input\":\"schnitzel + mit kartoffelsalat und bier 
\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"e4c36b98-1853-42d6-92ec-c919a75d0b9c\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"stamppot + boerenkool met rookworst\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"9bf1dfef-bb33-453c-9938-cbed356f5eb9\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"nasi + goreng dengan telur dan satay ayam\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"3ef03ee2-6aee-4efc-b02e-84cc8a1bea29\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":600},\"input\":{\"user_input\":\"\uBD88\uACE0\uAE30 + with \uAE40\uCE58 and extra \uBC25\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"2db67005-03d5-44ae-b819-9d838249d45a\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"\u30E9\u30FC\u30E1\u30F3\u7279\u76DB\u308A\u3001\u30C1\u30E3\u30FC\u30B7\u30E5\u30FC\u8FFD\u52A0\u3067\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"2e66c288-a920-47a4-91ba-58dba787b608\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"\u0926\u093E\u0932 + \u092E\u0916\u0928\u0940 \u0914\u0930 \u092C\u091F\u0930 \u0928\u093E\u0928 + \u0916\u093E\u092F\u093E\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"a5b8d3be-a56e-4f0d-bff3-e515410beff6\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":700},\"input\":{\"user_input\":\"coq + au vin avec pur\xE9e de pommes de terre\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"d5d8c6b6-4043-41e1-868e-c6b653f40451\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":800},\"input\":{\"user_input\":\"\u0643\u0633\u0643\u0633 + \u0628\u0644\u062D\u0645 \u0627\u0644\u0636\u0623\u0646 \u0648\u0627\u0644\u062E\u0636\u0631\u0648\u0627\u062A\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"f6c22335-7eed-40ce-b25f-99a9b8e3bc97\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":650},\"input\":{\"user_input\":\"\u03B3\u03B5\u03BC\u03B9\u03C3\u03C4\u03AC + 
\u03BC\u03B5 \u03C4\u03B6\u03B1\u03C4\u03B6\u03AF\u03BA\u03B9 \u03BA\u03B1\u03B9 + \u03C6\u03AD\u03C4\u03B1\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"758a5d1f-ac29-429c-bd0d-cbe086e3b43a\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":600},\"input\":{\"user_input\":\"pierogi + ruskie ze \u015Bmietan\u0105\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"4a5d0a55-c7be-46e0-8fd5-64dd84972e2c\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":500},\"input\":{\"user_input\":\"b\xE1nh + x\xE8o v\u1EDBi n\u01B0\u1EDBc m\u1EAFm pha\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}},{\"id\":\"cedbb75a-b3ae-4169-ad55-70a24267b97e\",\"type\":\"datasets\",\"attributes\":{\"author\":{\"id\":\"8855edca-05df-11ec-bea0-da7ad0900002\"},\"created_at\":\"2024-11-05T15:57:29.199538Z\",\"dataset_id\":\"f61953f8-43de-4a99-bcaf-0b145471dcb0\",\"expected_output\":{\"calories\":600},\"input\":{\"user_input\":\"\u05DE\u05E6\u05D0 + \u05E4\u05DC\u05D0\u05E4\u05DC \u05E2\u05DD \u05D7\u05D5\u05DE\u05D5\u05E1 + \u05D5\u05E1\u05DC\u05D8\"},\"updated_at\":\"2024-11-05T15:57:29.199538Z\"}}]}" + headers: + Connection: + - close + Content-Type: + - application/vnd.api+json + Date: + - Tue, 05 Nov 2024 21:46:57 GMT + Transfer-Encoding: + - chunked + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/cassettes/experiments/test_dataset_pull_dne.yaml b/tests/llmobs/cassettes/experiments/test_dataset_pull_dne.yaml new file mode 100644 index 00000000000..7cc0e636a2c --- /dev/null +++ b/tests/llmobs/cassettes/experiments/test_dataset_pull_dne.yaml @@ -0,0 +1,38 @@ +interactions: +- request: + body: null + headers: + Connection: + - close + Content-Type: + - application/json + Host: + - api.datadoghq.com + User-Agent: + - Python-urllib/3.12 + method: GET + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets?filter[name]=dataset-does-not-exist + response: + body: + string: '{"data":[]}' + headers: + Connection: + - close + Content-Length: + - '11' + Content-Type: + - application/vnd.api+json + Date: + - Tue, 05 Nov 2024 21:57:02 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/test_llmobs_experiments.py b/tests/llmobs/test_llmobs_experiments.py index f01551afc17..482890fdf4c 100644 --- a/tests/llmobs/test_llmobs_experiments.py +++ b/tests/llmobs/test_llmobs_experiments.py @@ -1,4 +1,5 @@ import itertools +import os from typing import Any from typing import Callable from typing import Dict @@ -6,6 +7,28 @@ from typing import Union from ddtrace.llmobs import Dataset +import pytest +import vcr + + +# Define a function to scrub the headers you want to remove +def scrub_response_headers(response): + # Remove specific headers + headers_to_remove = ["content-security-policy"] + for header in headers_to_remove: + 
response["headers"].pop(header, None) + return response + + +@pytest.fixture +def experiments_vcr(): + return vcr.VCR( + cassette_library_dir=os.path.join(os.path.dirname(__file__), "cassettes/experiments"), + record_mode="once", + match_on=["path"], + filter_headers=["DD-API-KEY", "DD-APPLICATION-KEY", "Openai-Api-Key", "Authorization"], + before_record_response=scrub_response_headers, + ) def parametrize(**param_dict: Dict[str, Union[Any, List[Any]]]) -> Callable: @@ -43,9 +66,22 @@ def test_create_dataset(): {"input": {"prompt": "capital of Germany?"}, "expected_output": {"response": "Berlin"}}, {"input": {"prompt": "capital of Japan?"}, "expected_output": {"response": "Tokyo"}}, {"input": {"prompt": "capital of Canada?"}, "expected_output": {"response": "Ottawa"}}, - # ... more data entries ... ], ) - assert dataset.name == "geography-dataset" assert dataset[0] == {"input": {"prompt": "capital of France?"}, "expected_output": {"response": "Paris"}} + + +def test_dataset_pull(experiments_vcr): + with experiments_vcr.use_cassette("test_dataset_pull.yaml"): + dataset = Dataset.from_datadog("meal-calorie-dataset-multilingual-3") + assert len(dataset) > 0 + assert isinstance(dataset[0], dict) + assert "input" in dataset[0] + assert "expected_output" in dataset[0] + + +def test_dataset_pull_dne(experiments_vcr): + with experiments_vcr.use_cassette("test_dataset_pull_dne.yaml"): + with pytest.raises(ValueError): + Dataset.from_datadog("dataset-does-not-exist") diff --git a/tests/llmobs/test_utils.py b/tests/llmobs/test_utils.py index 09f23926e86..2b7011c1fc4 100644 --- a/tests/llmobs/test_utils.py +++ b/tests/llmobs/test_utils.py @@ -1,5 +1,6 @@ import pytest +from ddtrace.llmobs._utils import http_request from ddtrace.llmobs.utils import Documents from ddtrace.llmobs.utils import Messages @@ -99,3 +100,12 @@ def test_documents_dictionary_with_incorrect_value_types(): Documents({"text": "hello", "name": {"key": "value"}}) with pytest.raises(TypeError): Documents([{"text": "hello", "score": "123"}]) + + +def test_http_request(): + response = http_request("GET", "https://httpbin.org/get") + assert response.status_code == 200 + data = response.json() + assert data["url"] == "https://httpbin.org/get" + assert data["args"] == {} + assert data["headers"]["Host"] == "httpbin.org" From d29f08185397ba1876f65a5593ccb059589146f3 Mon Sep 17 00:00:00 2001 From: Kyle Verhoog Date: Tue, 5 Nov 2024 17:09:51 -0500 Subject: [PATCH 15/36] fmt --- ddtrace/llmobs/_utils.py | 1 - tests/llmobs/test_llmobs_experiments.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ddtrace/llmobs/_utils.py b/ddtrace/llmobs/_utils.py index 667dcdd7fd5..898707998ec 100644 --- a/ddtrace/llmobs/_utils.py +++ b/ddtrace/llmobs/_utils.py @@ -181,7 +181,6 @@ def json(self) -> dict: Note that this method can only be called once as the response content is read and consumed. 
""" data = self._resp.read() - print(data) return json.loads(data.decode("utf-8")) diff --git a/tests/llmobs/test_llmobs_experiments.py b/tests/llmobs/test_llmobs_experiments.py index 482890fdf4c..7d3612e5bd8 100644 --- a/tests/llmobs/test_llmobs_experiments.py +++ b/tests/llmobs/test_llmobs_experiments.py @@ -6,10 +6,11 @@ from typing import List from typing import Union -from ddtrace.llmobs import Dataset import pytest import vcr +from ddtrace.llmobs import Dataset + # Define a function to scrub the headers you want to remove def scrub_response_headers(response): From dc119d0894183e30134f28ae4b3eace038de5ddf Mon Sep 17 00:00:00 2001 From: Kyle Verhoog Date: Tue, 5 Nov 2024 17:19:52 -0500 Subject: [PATCH 16/36] more stdout cleanup, http status code checking --- ddtrace/llmobs/_experiments.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index f1998f1a8cb..8ae14343999 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -378,17 +378,8 @@ def evaluate_output(idx_output): results_buffer[idx] = future.result()["result"] completed += 1 - # Update progress - progress = int(50 * completed / total_rows) - bar = f"{'=' * progress}{' ' * (50 - progress)}" - percent = int(100 * completed / total_rows) - sys.stdout.write(f"\rEvaluating {self.name}: [{bar}] {percent}% ({completed}/{total_rows})") - sys.stdout.flush() - results.experiment_rows = results_buffer - sys.stdout.write("\n") - self.has_evaluated = True self.results = results return results @@ -425,7 +416,7 @@ def __len__(self) -> int: def __getitem__(self, index: int) -> Any: return self.experiment_rows[index] - def push(self) -> Dict[str, str]: + def push(self) -> None: """Push the experiment results to Datadog. 
Returns: @@ -582,7 +573,6 @@ def push(self) -> Dict[str, str]: url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" exp_http_request("POST", url, body=json.dumps(results_payload).encode("utf-8")) - return self def _make_id() -> str: @@ -613,4 +603,6 @@ def exp_http_request(method: str, url: str, body: Optional[bytes] = None) -> HTT "Content-Type": "application/json", } url = BASE_URL + url - return http_request(method, url, headers=headers, body=body) + resp = HTTPResponse(http_request(method, url, headers=headers, body=body)) + if resp.status_code >= 400: + raise ValueError(f"Failed to make request, got status code {resp.status_code}.") From f0182984c2cba36a27fe11ac3e4f404f2ebb086b Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Fri, 8 Nov 2024 15:13:25 -0500 Subject: [PATCH 17/36] Add feedback from sync --- ddtrace/llmobs/__init__.py | 6 +- ddtrace/llmobs/_experiments.py | 736 +++++++++++++++++++++++++++------ ddtrace/llmobs/_utils.py | 41 +- 3 files changed, 653 insertions(+), 130 deletions(-) diff --git a/ddtrace/llmobs/__init__.py b/ddtrace/llmobs/__init__.py index 73429c6713d..bd382219754 100644 --- a/ddtrace/llmobs/__init__.py +++ b/ddtrace/llmobs/__init__.py @@ -9,7 +9,11 @@ from ._experiments import Dataset from ._experiments import Experiment from ._experiments import ExperimentResults +from ._experiments import FileType +from ._experiments import task +from ._experiments import evaluator +from ._experiments import ExperimentGrid from ._llmobs import LLMObs -__all__ = ["LLMObs", "Dataset", "Experiment", "ExperimentResults"] +__all__ = ["LLMObs", "Dataset", "Experiment", "ExperimentResults", "FileType", "task", "evaluator", "ExperimentGrid"] diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 8ae14343999..fe093aec074 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,23 +1,33 @@ +# TODO: Add error handling + import concurrent.futures from datetime import datetime import json import os -import sys import time -from typing import Any -from typing import Callable -from typing import Dict -from typing import Iterator -from typing import List -from typing import Optional +from typing import Any, Callable, Dict, Iterator, List, Optional +import inspect +from functools import wraps from urllib.parse import quote import uuid +import csv +from enum import Enum +import itertools +import hashlib +import threading from ._utils import HTTPResponse from ._utils import http_request -BASE_URL = "https://api.datadoghq.com" +DD_SITE = os.getenv("DD_SITE", "datadoghq.com") +BASE_URL = f"https://api.{DD_SITE}" + + +class FileType(Enum): + CSV = 'csv' + PARQUET = 'parquet' + JSONL = 'jsonl' class Dataset: @@ -74,13 +84,23 @@ def _validate_data(self, data: List[Dict[str, Any]]) -> None: if set(row.keys()) != first_row_keys: raise ValueError("All rows must have the same keys.") + # Validate that 'input' exists and is a dictionary + if 'input' not in row: + raise ValueError("Each row must contain an 'input' field") + if not isinstance(row['input'], dict): + raise ValueError("The 'input' field must be a dictionary") + + # If expected_output exists, validate it's a dictionary + if 'expected_output' in row and not isinstance(row['expected_output'], dict): + raise ValueError("The 'expected_output' field must be a dictionary") + # Check that 'input' and 'expected_output' are flat dictionaries for key in ["input", "expected_output"]: if key in row and any(isinstance(value, dict) for value in row[key].values()): raise 
ValueError(f"'{key}' must be a flat dictionary (no nested dictionaries).") @classmethod - def from_datadog(cls, name: str) -> "Dataset": + def pull(cls, name: str) -> "Dataset": """Create a dataset from a dataset hosted in Datadog. Args: @@ -127,7 +147,7 @@ def from_datadog(cls, name: str) -> "Dataset": dataset._datadog_dataset_id = dataset_id return dataset - def push(self) -> Dict[str, str]: + def push(self) -> None: """Push the dataset to Datadog. Returns: @@ -173,7 +193,235 @@ def push(self) -> Dict[str, str]: url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" resp = exp_http_request("POST", url, body=json.dumps(records_payload).encode("utf-8")) data = resp.json() - return data + + @classmethod + def from_csv( + cls, + filepath: str, + name: str, + description: str = "", + delimiter: str = ",", + input_columns: List[str] = None, + expected_output_columns: List[str] = None, + metadata_columns: List[str] = None, + ) -> "Dataset": + if input_columns is None or expected_output_columns is None: + raise ValueError("`input_columns` and `expected_output_columns` must be provided.") + + data = [] + try: + with open(filepath, mode='r', encoding='utf-8') as csvfile: + reader = csv.DictReader(csvfile, delimiter=delimiter) + rows = list(reader) + if not rows: + raise ValueError("CSV file is empty.") + + # Ensure that the specified columns are present + header_columns = reader.fieldnames + missing_input_columns = [col for col in input_columns if col not in header_columns] + missing_output_columns = [col for col in expected_output_columns if col not in header_columns] + missing_metadata_columns = [] + if metadata_columns: + missing_metadata_columns = [col for col in metadata_columns if col not in header_columns] + + if missing_input_columns: + raise ValueError(f"Input columns not found in CSV header: {missing_input_columns}") + if missing_output_columns: + raise ValueError(f"Expected output columns not found in CSV header: {missing_output_columns}") + if missing_metadata_columns: + raise ValueError(f"Metadata columns not found in CSV header: {missing_metadata_columns}") + + for row in rows: + input_data = {col: row[col] for col in input_columns} + expected_output_data = {col: row[col] for col in expected_output_columns} + metadata = {} + if metadata_columns: + metadata = {col: row[col] for col in metadata_columns} + + data.append({ + 'input': input_data, + 'expected_output': expected_output_data, + **metadata, + }) + except Exception as e: + raise Exception(f"Failed to read CSV file: {e}") + + return cls(name=name, data=data, description=description) + + @classmethod + def from_jsonl(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": + if input_columns is None or expected_output_columns is None: + raise ValueError("`input_columns` and `expected_output_columns` must be provided.") + + data = [] + try: + with open(filepath, mode='r', encoding='utf-8') as jsonlfile: + for line in jsonlfile: + row = json.loads(line.strip()) + + input_data = {col: row.get(col) for col in input_columns} + expected_output_data = {col: row.get(col) for col in expected_output_columns} + metadata = {} + if metadata_columns: + metadata = {col: row.get(col) for col in metadata_columns} + + data.append({ + 'input': input_data, + 'expected_output': expected_output_data, + **metadata, + }) + + if not data: + raise ValueError("JSONL file is empty.") + + except Exception as e: + raise 
Exception(f"Failed to read JSONL file: {e}") + + return cls(name=name, data=data, description=description) + + @classmethod + def from_parquet(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": + try: + import pandas as pd + except ImportError: + raise ImportError( + "pandas is required to read parquet files. " + "Please install pandas with: pip install pandas" + ) + + if input_columns is None or expected_output_columns is None: + raise ValueError("`input_columns` and `expected_output_columns` must be provided.") + + data = [] + try: + df = pd.read_parquet(filepath) + if df.empty: + raise ValueError("Parquet file is empty.") + + # Ensure that the specified columns are present + missing_input_columns = [col for col in input_columns if col not in df.columns] + missing_output_columns = [col for col in expected_output_columns if col not in df.columns] + missing_metadata_columns = [] + if metadata_columns: + missing_metadata_columns = [col for col in metadata_columns if col not in df.columns] + + if missing_input_columns: + raise ValueError(f"Input columns not found in DataFrame: {missing_input_columns}") + if missing_output_columns: + raise ValueError(f"Expected output columns not found in DataFrame: {missing_output_columns}") + if missing_metadata_columns: + raise ValueError(f"Metadata columns not found in DataFrame: {missing_metadata_columns}") + + for idx, row in df.iterrows(): + input_data = {col: row[col] for col in input_columns} + expected_output_data = {col: row[col] for col in expected_output_columns} + metadata = {} + if metadata_columns: + metadata = {col: row[col] for col in metadata_columns} + + data.append({ + 'input': input_data, + 'expected_output': expected_output_data, + **metadata, + }) + + except Exception as e: + raise Exception(f"Failed to read Parquet file: {e}") + + return cls(name=name, data=data, description=description) + + @classmethod + def import_file(cls, path: str, filetype: FileType, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None, delimiter: str = ",") -> "Dataset": + if filetype == FileType.CSV: + return cls.from_csv( + filepath=path, + name=name, + description=description, + delimiter=delimiter, + input_columns=input_columns, + expected_output_columns=expected_output_columns, + metadata_columns=metadata_columns, + ) + elif filetype == FileType.JSONL: + return cls.from_jsonl( + filepath=path, + name=name, + description=description, + input_columns=input_columns, + expected_output_columns=expected_output_columns, + metadata_columns=metadata_columns, + ) + elif filetype == FileType.PARQUET: + return cls.from_parquet( + filepath=path, + name=name, + description=description, + input_columns=input_columns, + expected_output_columns=expected_output_columns, + metadata_columns=metadata_columns, + ) + else: + raise ValueError(f"Unsupported file type: {filetype}") + + def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": + """Convert the dataset to a pandas DataFrame. + + Args: + multiindex (bool): If True, expand 'input' and 'expected_output' dictionaries into columns with MultiIndex. + If False, keep 'input' and 'expected_output' as columns containing dictionaries. + + Returns: + pd.DataFrame: DataFrame representation of the dataset. + + Raises: + ImportError: If pandas is not installed. 
+ """ + try: + import pandas as pd + except ImportError: + raise ImportError( + "pandas is required to convert dataset to DataFrame. " + "Please install it with `pip install pandas`" + ) + + if multiindex: + # Create a list of flattened dictionaries + flattened_data = [] + for record in self._data: + flat_record = {} + # Handle 'input' fields + for k, v in record.get('input', {}).items(): + flat_record[('input', k)] = v + # Handle 'expected_output' fields + for k, v in record.get('expected_output', {}).items(): + flat_record[('expected_output', k)] = v + # Handle any other top-level fields + for k, v in record.items(): + if k not in ['input', 'expected_output']: + flat_record[('metadata', k)] = v + flattened_data.append(flat_record) + + df = pd.DataFrame(flattened_data) + # Set columns as MultiIndex + df.columns = pd.MultiIndex.from_tuples(df.columns) + return df + else: + # Keep 'input' and 'expected_output' as dicts in the DataFrame + return pd.DataFrame(self._data) + + def export_to_jsonl(self, file_path): + """ + Exports the dataset to a JSONL file. + + Args: + file_path (str): The path to the output JSONL file. + """ + import json + + with open(file_path, 'w') as f: + for record in self._data: + json_line = json.dumps(record) + f.write(json_line + '\n') class Experiment: @@ -205,6 +453,7 @@ def __init__( project_name: str = "-", description: str = "", metadata: Dict[str, Any] = {}, + config: Optional[Dict[str, Any]] = None, ) -> None: self.name = name self.task = task @@ -214,59 +463,51 @@ def __init__( self.project_name = project_name self.description = description self.metadata = metadata + self.config = config + + # Enforce that the task function has the @task decorator + if not hasattr(self.task, '_is_task'): + raise TypeError("Task function must be decorated with @task decorator.") + + # Enforce that all evaluators have the @evaluator decorator + for evaluator_func in self.evaluators: + if not hasattr(evaluator_func, '_is_evaluator'): + raise TypeError(f"Evaluator '{evaluator_func.__name__}' must be decorated with @evaluator decorator.") + # Post-run attributes self.has_run = False self.has_evaluated = False self.outputs = [] self.results = None - def _validate_tasks(self) -> None: - # TODO: Design and implement this - pass - - def _validate_evaluators(self) -> None: - # TODO: Design and implement this - pass - - def _validate_tags(self) -> None: - """Validate experiment tags format. - - Raises: - ValueError: If any tag doesn't follow the 'key:value' format - """ - for tag in self.tags: - if not isinstance(tag, str) or ":" not in tag: - raise ValueError(f"Invalid tag format: {tag}. Tags should be in the format 'key:value'.") - - def run(self, _jobs: int = 10) -> None: - """Execute the experiment tasks on the dataset without performing evaluations. - - Runs the task function on each dataset record in parallel and stores - the outputs and metadata. - - Args: - _jobs (int, optional): Number of parallel workers. Defaults to 10. - Must be between 1 and 20. 
- - Raises: - ValueError: If _jobs is not between 1 and 20 - """ + def run_task(self, _jobs: int = 10) -> None: + """Execute the task function on the dataset and store the outputs.""" if not 1 <= _jobs <= 20: raise ValueError("Number of jobs must be between 1 and 20") - self.outputs = [] total_rows = len(self.dataset) + completed = 0 def process_row(idx_row): idx, row = idx_row try: - # Apply the task function to the row + # Extract the input data + input_data = row['input'] + # Apply the task function to the input data with config start_time = time.time() - output = self.task(row) + if getattr(self.task, '_accepts_config', False): + output = self.task(input_data, self.config) + else: + output = self.task(input_data) end_time = time.time() duration = end_time - start_time - return { + # **Ensure output is a dictionary** + if not isinstance(output, dict): + output = {'value': output} + + # Prepare output data + output_data = { "idx": idx, "output": output, "metadata": { @@ -279,8 +520,10 @@ def process_row(idx_row): }, "error": None, } + return output_data + except Exception as e: - return { + output_data = { "idx": idx, "output": None, "metadata": { @@ -293,100 +536,87 @@ def process_row(idx_row): }, "error": str(e), } + return output_data + + # Initialize the progress bar + _print_progress_bar(0, total_rows, prefix='Processing:', suffix='Complete') with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: future_to_idx = {executor.submit(process_row, (idx, row)): idx for idx, row in enumerate(self.dataset)} - # Process as they complete while maintaining order - completed = 0 outputs_buffer = [None] * total_rows for future in concurrent.futures.as_completed(future_to_idx): idx = future_to_idx[future] - outputs_buffer[idx] = future.result() + output_data = future.result() + outputs_buffer[idx] = output_data completed += 1 - self.outputs = outputs_buffer + _print_progress_bar(completed, total_rows, prefix='Processing:', suffix='Complete') + self.outputs = outputs_buffer self.has_run = True - return self.eval() - - def eval(self, _jobs: int = 10) -> "ExperimentResults": - """Evaluate the outputs using the provided evaluators. - - Runs the evaluators on each output in parallel and collects evaluations. - - Args: - _jobs (int, optional): Number of parallel workers. Defaults to 10. - Must be between 1 and 20. - - Returns: - ExperimentResults: Object containing the experiment results - - Raises: - ValueError: If _jobs is not between 1 and 20 - ValueError: If the experiment has not been run yet - """ - if not 1 <= _jobs <= 20: - raise ValueError("Number of jobs must be between 1 and 20") - + def run_evaluations(self) -> None: + """Run evaluators on the outputs and store the results.""" if not self.has_run: - raise ValueError("Experiment has not been run yet. Please call run() before eval().") + raise ValueError("Task has not been run yet. 
Please call run_task() before run_evaluations().") - results = ExperimentResults(self.dataset, self) + self.results = ExperimentResults(self.dataset, self) + results_buffer = [] total_rows = len(self.outputs) + completed = 0 + + # Initialize the progress bar + _print_progress_bar(0, total_rows, prefix='Evaluating:', suffix='Complete') - def evaluate_output(idx_output): - idx, output_data = idx_output + for idx, output_data in enumerate(self.outputs): try: - idx_in_dataset = output_data["metadata"]["dataset_record_idx"] - row = self.dataset[idx_in_dataset] + # Retrieve output from output_data output = output_data["output"] - evaluations = {evaluator.__name__: evaluator(row, output) for evaluator in self.evaluators} - + # Get the corresponding dataset row + dataset_row = self.dataset[idx] + input_data = dataset_row.get('input', {}) + expected_output = dataset_row.get('expected_output', {}) + + # Perform evaluation + evaluations = {} + for evaluator in self.evaluators: + evaluation_result = evaluator(expected_output, output, input_data) + evaluations[evaluator.__name__] = evaluation_result + + # Prepare result data result = { "output": output, "evaluations": evaluations, "metadata": output_data["metadata"], "tags": self.tags, - "error": output_data["error"], + "error": None #TODO: Add error handling } - - return {"idx": idx, "result": result} except Exception as e: - return { - "idx": idx, - "result": { - "output": output_data["output"], - "evaluations": {}, - "metadata": output_data["metadata"], - "tags": self.tags, - "error": str(e), - }, + result = { + "output": output_data.get('output'), + "evaluations": {}, + "metadata": output_data["metadata"], + "tags": self.tags, + "error": str(e), } - with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: - future_to_idx = { - executor.submit(evaluate_output, (idx, output_data)): idx - for idx, output_data in enumerate(self.outputs) - } - - # Process as they complete while maintaining order - completed = 0 - results_buffer = [None] * total_rows - for future in concurrent.futures.as_completed(future_to_idx): - idx = future_to_idx[future] - results_buffer[idx] = future.result()["result"] - completed += 1 - - results.experiment_rows = results_buffer + results_buffer.append(result) + completed += 1 + _print_progress_bar(completed, total_rows, prefix='Evaluating:', suffix='Complete') self.has_evaluated = True - self.results = results - return results + self.results.experiment_rows = results_buffer + + def run(self, _jobs: int = 10) -> "ExperimentResults": + """Execute the task and evaluations, returning the results.""" + self.run_task(_jobs=_jobs) + self.run_evaluations() + print() # Move to the next line after completion + return self.results def get_results(self) -> "ExperimentResults": if not self.has_evaluated: - raise ValueError("Evaluations have not been performed yet. Please call eval() after run().") + raise ValueError("Evaluations have not been performed yet. Please call run() or run_evaluations().") return self.results @@ -416,15 +646,18 @@ def __len__(self) -> int: def __getitem__(self, index: int) -> Any: return self.experiment_rows[index] - def push(self) -> None: + def push(self, overwrite: bool = False) -> None: """Push the experiment results to Datadog. 
- Returns: - Dict[str, str]: Dictionary containing experiment information including: - - experiment_id: The ID of the created experiment - - experiment_name: The name of the experiment - - span_count: Number of spans uploaded + Raises: + ValueError: If the dataset hasn't been pushed to Datadog first """ + if not self.experiment.dataset._datadog_dataset_id: + raise ValueError( + "Dataset has not been pushed to Datadog. " + "Please call dataset.push() before pushing experiment results." + ) + # Check if project exists url = f"/api/unstable/llm-obs/v1/projects?filter[name]={self.experiment.project_name}" resp = exp_http_request("GET", url) @@ -472,6 +705,7 @@ def push(self) -> None: "metadata": { "tags": self.experiment.tags, **self.experiment.metadata, + "config": self.experiment.config, }, }, } @@ -496,6 +730,7 @@ def push(self) -> None: "metadata": { "tags": self.experiment.tags, **self.experiment.metadata, + "config": self.experiment.config, }, }, } @@ -574,6 +809,113 @@ def push(self) -> None: url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" exp_http_request("POST", url, body=json.dumps(results_payload).encode("utf-8")) + def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": + """Convert the experiment results to a pandas DataFrame, including the experiment config. + + Args: + multiindex (bool): If True, expand nested dictionaries into MultiIndex columns. + If False, keep the nested dictionaries as they are. + + Returns: + pd.DataFrame: A DataFrame representation of the experiment results. + + Raises: + ImportError: If pandas is not installed. + """ + try: + import pandas as pd + except ImportError: + raise ImportError( + "pandas is required to convert experiment results to DataFrame. " + "Please install it with `pip install pandas`" + ) + + # Collect data + data = [] + for result in self.experiment_rows: + record = {} + # Get index of the dataset record + idx = result['metadata'].get('dataset_record_idx') + dataset_record = self.dataset[idx] + + if multiindex: + + # Flatten 'input' and 'expected_output' from the dataset + for k, v in dataset_record.get('input', {}).items(): + record[('input', k)] = v + for k, v in dataset_record.get('expected_output', {}).items(): + record[('expected_output', k)] = v + + # Flatten 'output' from the result + output = result.get('output', {}) + if isinstance(output, dict): + for k, v in output.items(): + record[('output', k)] = v + else: + record[('output', 'value')] = output + + # Flatten 'evaluations' from the result + evaluations = result.get('evaluations', {}) + for evaluator_name, evaluation in evaluations.items(): + if isinstance(evaluation, dict): + for k, v in evaluation.items(): + record[('evaluations', evaluator_name, k)] = v + else: + record[('evaluations', evaluator_name)] = evaluation + + # Flatten 'config' from the experiment, if it exists + if self.experiment.config: + for k, v in self.experiment.config.items(): + record[('config', k)] = v + + # Flatten 'metadata' from the result + for k, v in result.get('metadata', {}).items(): + # Skip project_name, experiment_name, and dataset_name + if k not in ['project_name', 'experiment_name', 'dataset_name']: + record[('metadata', k)] = v + + + # Include 'error' if any + error = result.get('error') + if error: + record[('error', 'message')] = error + + else: + # Include config as a dictionary, if it exists + if self.experiment.config: + record['config'] = self.experiment.config + + # Keep nested dictionaries + record['input'] = dataset_record.get('input', {}) + 
record['expected_output'] = dataset_record.get('expected_output', {}) + record['output'] = result.get('output', {}) + record['evaluations'] = result.get('evaluations', {}) + record['metadata'] = result.get('metadata', {}) + record['tags'] = result.get('tags', []) + record['error'] = result.get('error') + + data.append(record) + + df = pd.DataFrame(data) + if multiindex: + # Set columns as MultiIndex + df.columns = pd.MultiIndex.from_tuples(df.columns) + return df + + def export_to_jsonl(self, file_path): + """ + Exports the experiment results to a JSONL file. + + Args: + file_path (str): The path to the output JSONL file. + """ + import json + + with open(file_path, 'w') as f: + for result in self.experiment_rows: + json_line = json.dumps(result) + f.write(json_line + '\n') + def _make_id() -> str: """Generate a unique identifier. @@ -603,6 +945,164 @@ def exp_http_request(method: str, url: str, body: Optional[bytes] = None) -> HTT "Content-Type": "application/json", } url = BASE_URL + url - resp = HTTPResponse(http_request(method, url, headers=headers, body=body)) + resp = http_request(method, url, headers=headers, body=body) if resp.status_code >= 400: - raise ValueError(f"Failed to make request, got status code {resp.status_code}.") + try: + error_details = resp.json() + error_message = error_details.get('errors', [{}])[0].get('detail', resp.text()) + except Exception: + error_message = resp.text() + raise ValueError(f"Request failed with status code {resp.status_code}: {error_message}") + return resp + + +def task(func): + if func.__name__ == "task": + raise ValueError("Function name 'task' is reserved. Please use a different name for your task function.") + + @wraps(func) + def wrapper(input: Dict[str, Any], config: Optional[Dict[str, Any]] = None) -> Any: + # Call the original function with or without config + if 'config' in inspect.signature(func).parameters: + return func(input, config) + else: + return func(input) + # Enforce signature compliance + sig = inspect.signature(func) + params = sig.parameters + if 'input' not in params: + raise TypeError("Task function must have an 'input' parameter.") + # Set attribute to indicate whether the function accepts config + wrapper._accepts_config = 'config' in params + wrapper._is_task = True # Set attribute to indicate decoration + return wrapper + + +def evaluator(func): + @wraps(func) + def wrapper(expected_output: Dict[str, Any], output: Any, input: Dict[str, Any] = None) -> Any: + return func(expected_output, output, input) + # Enforce signature compliance + sig = inspect.signature(func) + params = sig.parameters + required_params = ['expected_output', 'output', 'input'] + if not all(param in params for param in required_params): + raise TypeError(f"Evaluator function must have parameters {required_params}.") + wrapper._is_evaluator = True # Set attribute to indicate decoration + return wrapper + + +class ExperimentGrid: + """Class to run a grid of experiments over multiple parameter combinations. + + Attributes: + name (str): Name of the experiment grid. + task (Callable): The task function to execute. + dataset (Dataset): The dataset to use. + evaluators (List[Callable]): List of evaluator functions. + config (Dict[str, List[Any]]): Parameter grid to run over. + tags (List[str]): List of tags. + project_name (str): Name of the project. + description (str): Description of the experiment grid. + metadata (Dict[str, Any]): Metadata dictionary. + experiments (List[Experiment]): List of experiments created. 
+ results (List[ExperimentResults]): List of corresponding results. + """ + + def __init__( + self, + name: str, + task: Callable, + dataset: Dataset, + evaluators: List[Callable], + config: Dict[str, List[Any]], + tags: List[str] = [], + project_name: str = "-", + description: str = "", + metadata: Dict[str, Any] = {}, + ) -> None: + self.name = name + self.task = task + self.dataset = dataset + self.evaluators = evaluators + self.config = config + self.tags = tags + self.project_name = project_name + self.description = description + self.metadata = metadata + self.experiments = [] + self.results = [] + + # Generate all parameter combinations and create experiments + self._generate_experiments() + + def _generate_experiments(self): + keys, values = zip(*self.config.items()) + param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)] + + for params in param_combinations: + # Create config for the experiment + config = params.copy() + + # Compute hash of the config + config_str = json.dumps(config, sort_keys=True) + config_hash = hashlib.md5(config_str.encode('utf-8')).hexdigest() + config_hash_tag = f"config_hash:{config_hash}" + + # Generate a unique name for each experiment + experiment_name = f"{self.name}_" + "_".join(f"{k}_{v}" for k, v in params.items()) + + # Create tags for parameters + param_tags = [f"{k}:{v}" for k, v in params.items()] + [config_hash_tag] + + # Create a new experiment instance with updated config and name + experiment = Experiment( + name=experiment_name, + task=self.task, + dataset=self.dataset, + evaluators=self.evaluators, + tags=self.tags + param_tags, + project_name=self.project_name, + description=self.description, + metadata={**self.metadata, "config": config}, + config=config, + ) + + # Add the experiment to the list without running it + self.experiments.append(experiment) + + def __len__(self): + return len(self.experiments) + + def __getitem__(self, index): + return self.experiments[index] + + # Update the run method to use the pre-generated experiments + def run(self, _jobs: int = 10): + """Run experiments for all combinations of parameters in the grid. + + Args: + _jobs (int): Number of parallel workers for each experiment run. + """ + for experiment in self.experiments: + experiment.run(_jobs=_jobs) + self.results.append(experiment.get_results()) + + return self.results + + def get_all_results(self) -> List[ExperimentResults]: + """Return all results from the experiment grid. + + Returns: + List[ExperimentResults]: A list of results for each experiment. 
+ """ + return self.results + + +def _print_progress_bar(iteration, total, prefix='', suffix='', decimals=1, length=50, fill='█'): + percent = f"{100 * (iteration / float(total)):.{decimals}f}" + filled_length = int(length * iteration // total) + bar = fill * filled_length + '-' * (length - filled_length) + print(f'\r{prefix} |{bar}| {percent}% {suffix}', end='\r') + if iteration == total: + print() diff --git a/ddtrace/llmobs/_utils.py b/ddtrace/llmobs/_utils.py index 898707998ec..8907ebd6265 100644 --- a/ddtrace/llmobs/_utils.py +++ b/ddtrace/llmobs/_utils.py @@ -4,6 +4,7 @@ from typing import Optional from typing import Union import urllib.request +from urllib.error import HTTPError import ddtrace from ddtrace import Span @@ -168,28 +169,46 @@ def safe_json(obj): class HTTPResponse: - def __init__(self, resp: http.client.HTTPResponse) -> None: + def __init__(self, resp) -> None: + if resp is None: + raise ValueError("Response object cannot be None") self._resp = resp + self._content = None # Cache the content @property def status_code(self) -> int: - return self._resp.status + if hasattr(self._resp, 'status'): + return self._resp.status + elif hasattr(self._resp, 'code'): + return self._resp.code + elif hasattr(self._resp, 'getcode'): + return self._resp.getcode() + else: + raise AttributeError(f"Could not find status code in response object of type {type(self._resp)}") + + def read(self) -> bytes: + if self._content is None: + self._content = self._resp.read() + return self._content + + def text(self) -> str: + return self.read().decode('utf-8') def json(self) -> dict: - """Return the JSON content of the response. - - Note that this method can only be called once as the response content is read and consumed. - """ - data = self._resp.read() - return json.loads(data.decode("utf-8")) + return json.loads(self.text()) def http_request( method: str, url: str, headers: Optional[Dict[str, str]] = None, body: Optional[bytes] = None ) -> HTTPResponse: + """Make an HTTP request and return an HTTPResponse object.""" # Create the request object req = urllib.request.Request(url, data=body, method=method) if headers: - for key, value in headers.items(): - req.add_header(key, value) - return HTTPResponse(urllib.request.urlopen(req)) + req.headers.update(headers) + try: + response = urllib.request.urlopen(req) + return HTTPResponse(response) + except HTTPError as e: + # Create an HTTPResponse object from the error response + return HTTPResponse(e) From 351cd7a49b45bd59657bbd0205691d41ea2571a2 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Sun, 10 Nov 2024 17:50:40 -0500 Subject: [PATCH 18/36] Add error handling on tasks --- ddtrace/llmobs/_experiments.py | 624 +++++++++++++++++++++------------ 1 file changed, 402 insertions(+), 222 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index fe093aec074..9c6f3f3b3c9 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,4 +1,8 @@ -# TODO: Add error handling +# TODO: Test failures on badly defined evaluators +# TODO: Test workflows for re-evals and publishing results +# TODO: Handle behavior pushing experiment results without dataset +# TODO: Idempotency of push/pull methods +# TODO: Support running on subsets of datasets import concurrent.futures from datetime import datetime @@ -14,11 +18,11 @@ from enum import Enum import itertools import hashlib -import threading from ._utils import HTTPResponse from ._utils import http_request +import ddtrace DD_SITE = os.getenv("DD_SITE", 
"datadoghq.com") BASE_URL = f"https://api.{DD_SITE}" @@ -117,6 +121,7 @@ def pull(cls, name: str) -> "Dataset": encoded_name = quote(name) url = f"/api/unstable/llm-obs/v1/datasets?filter[name]={encoded_name}" resp = exp_http_request("GET", url) + response_data = resp.json() datasets = response_data.get("data", []) @@ -194,6 +199,9 @@ def push(self) -> None: resp = exp_http_request("POST", url, body=json.dumps(records_payload).encode("utf-8")) data = resp.json() + # Print url to the dataset in Datadog + print(f"Dataset '{self.name}' created: {BASE_URL}/llm/experiments/datasets/{dataset_id}") + @classmethod def from_csv( cls, @@ -437,10 +445,13 @@ class Experiment: evaluators (List[Callable]): Functions that evaluate task outputs tags (List[str]): Tags for organizing experiments project_name (str): Name of the project this experiment belongs to + description (str): Description of the experiment + metadata (Dict[str, Any]): Additional metadata for the experiment + config (Optional[Dict[str, Any]]): Configuration for the task has_run (bool): Whether the experiment has been executed has_evaluated (bool): Whether the evaluations have been performed outputs (List[Dict]): Outputs after running the task - results (ExperimentResults): Results after running evaluations + evaluations (List[Dict]): Evaluation results after running evaluators """ def __init__( @@ -478,90 +489,228 @@ def __init__( self.has_run = False self.has_evaluated = False self.outputs = [] - self.results = None + self.evaluations = [] - def run_task(self, _jobs: int = 10) -> None: + def run_task( + self, + _jobs: int = 10, + timeout: Optional[float] = None, + retries: int = 0, + max_delay: float = 60.0, + raise_on_error: bool = False, + ) -> None: """Execute the task function on the dataset and store the outputs.""" if not 1 <= _jobs <= 20: raise ValueError("Number of jobs must be between 1 and 20") + if retries < 0: + raise ValueError("Number of retries must be non-negative") self.outputs = [] total_rows = len(self.dataset) completed = 0 def process_row(idx_row): idx, row = idx_row - try: - # Extract the input data - input_data = row['input'] - # Apply the task function to the input data with config - start_time = time.time() - if getattr(self.task, '_accepts_config', False): - output = self.task(input_data, self.config) - else: - output = self.task(input_data) - end_time = time.time() - duration = end_time - start_time - - # **Ensure output is a dictionary** - if not isinstance(output, dict): - output = {'value': output} - - # Prepare output data - output_data = { - "idx": idx, - "output": output, - "metadata": { - "timestamp": start_time, - "duration": duration, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "error": None, - } - return output_data - - except Exception as e: - output_data = { - "idx": idx, - "output": None, - "metadata": { - "timestamp": time.time(), - "duration": 0, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "error": str(e), - } - return output_data + attempt = 0 + delay = 1.0 # Initial delay in seconds + + while attempt <= retries: + try: + # Extract the input data + input_data = row['input'] + start_time = time.time() + + def execute_task(): + if getattr(self.task, '_accepts_config', False): + return self.task(input_data, self.config) + else: + return self.task(input_data) + + # Use ThreadPoolExecutor to enforce 
timeout + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as single_executor: + future = single_executor.submit(execute_task) + output = future.result(timeout=timeout) + + end_time = time.time() + duration = end_time - start_time + + # Ensure output is a dictionary + if not isinstance(output, dict): + output = {'value': output} + + # Prepare output data + output_data = { + "idx": idx, + "output": output, + "metadata": { + "timestamp": start_time, + "duration": duration, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "error": { + "message": None, + "stack": None, + "type": None, + } + } + return output_data + + except concurrent.futures.TimeoutError as e: + if raise_on_error: + # Reraise the exception to trigger cancellation + raise Exception(f"TimeoutError in task for row {idx}: {e}") from e + if attempt < retries: + # Exponential backoff and retry + sleep_time = min(delay, max_delay) + time.sleep(sleep_time) + delay *= 2 + attempt += 1 + else: + # All retries exhausted, record the timeout error + output_data = { + "idx": idx, + "output": None, + "metadata": { + "timestamp": time.time(), + "duration": 0, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "error": { + "message": "Task timed out", + "stack": None, + "type": "TimeoutError", + } + } + return output_data + + except Exception as e: + if raise_on_error: + # Reraise the exception to trigger cancellation + error_type = type(e).__name__ + raise Exception(f"Exception in task for row {idx}: {error_type}: {e}") from e + if attempt < retries: + # Exponential backoff and retry + sleep_time = min(delay, max_delay) + time.sleep(sleep_time) + delay *= 2 + attempt += 1 + else: + # All retries exhausted, record the error + output_data = { + "idx": idx, + "output": None, + "metadata": { + "timestamp": time.time(), + "duration": 0, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "error": { + "message": str(e), + "stack": None, + "type": type(e).__name__, + } + } + return output_data # Initialize the progress bar _print_progress_bar(0, total_rows, prefix='Processing:', suffix='Complete') + # Use a flag to determine if an error occurred + error_occurred = False + error_exception = None + with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: - future_to_idx = {executor.submit(process_row, (idx, row)): idx for idx, row in enumerate(self.dataset)} + # Submit the process_row function to the executor for each dataset record + futures = {executor.submit(process_row, (idx, row)): idx for idx, row in enumerate(self.dataset)} outputs_buffer = [None] * total_rows - for future in concurrent.futures.as_completed(future_to_idx): - idx = future_to_idx[future] - output_data = future.result() - outputs_buffer[idx] = output_data - completed += 1 - _print_progress_bar(completed, total_rows, prefix='Processing:', suffix='Complete') + try: + for future in concurrent.futures.as_completed(futures): + idx = futures[future] + try: + output_data = future.result() + outputs_buffer[idx] = output_data + if raise_on_error and output_data['error']['message']: + # An error occurred; cancel all futures + error_occurred = True + error_exception = Exception(f"Task failed on row {idx}: {output_data['error']['message']}") + break + except Exception as e: + outputs_buffer[idx] 
= { + "idx": idx, + "output": None, + "metadata": { + "timestamp": time.time(), + "duration": 0, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "error": { + "message": str(e), + "stack": None, + "type": type(e).__name__, + } + } + if raise_on_error: + # An exception occurred; cancel all futures + error_occurred = True + error_exception = e + break + completed += 1 + _print_progress_bar(completed, total_rows, prefix='Processing:', suffix='Complete') + finally: + if error_occurred: + # Cancel all pending futures + for future in futures: + future.cancel() + # Shutdown the executor immediately + executor.shutdown(wait=False) + raise error_exception self.outputs = outputs_buffer self.has_run = True - def run_evaluations(self) -> None: - """Run evaluators on the outputs and store the results.""" + # Log error statistics if any errors occurred + error_count = sum(1 for output in self.outputs if output['error']['message'] is not None) + if error_count > 0: + error_rate = (error_count / total_rows) * 100 + print(f"Task completed with {error_count} errors ({error_rate:.2f}% error rate)") + + def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_on_error: bool = False) -> "ExperimentResults": + """Run evaluators on the outputs and return ExperimentResults. + + Args: + evaluators (Optional[List[Callable]]): List of evaluators to use. If None, uses the experiment's evaluators. + raise_on_error (bool): If True, raises exceptions encountered during evaluation. + + Returns: + ExperimentResults: A new ExperimentResults instance with the evaluation results. + + Raises: + ValueError: If task has not been run yet + """ if not self.has_run: raise ValueError("Task has not been run yet. 
Please call run_task() before run_evaluations().") - self.results = ExperimentResults(self.dataset, self) - results_buffer = [] + # Use provided evaluators or fall back to experiment's evaluators + evaluators_to_use = evaluators if evaluators is not None else self.evaluators + + # Validate that all evaluators have the @evaluator decorator + for evaluator_func in evaluators_to_use: + if not hasattr(evaluator_func, '_is_evaluator'): + raise TypeError(f"Evaluator '{evaluator_func.__name__}' must be decorated with @evaluator decorator.") + + evaluations = [] total_rows = len(self.outputs) completed = 0 @@ -570,7 +719,7 @@ def run_evaluations(self) -> None: for idx, output_data in enumerate(self.outputs): try: - # Retrieve output from output_data + # Retrieve output from outputs output = output_data["output"] # Get the corresponding dataset row dataset_row = self.dataset[idx] @@ -578,46 +727,60 @@ def run_evaluations(self) -> None: expected_output = dataset_row.get('expected_output', {}) # Perform evaluation - evaluations = {} - for evaluator in self.evaluators: + evaluations_dict = {} + for evaluator in evaluators_to_use: evaluation_result = evaluator(expected_output, output, input_data) - evaluations[evaluator.__name__] = evaluation_result + evaluations_dict[evaluator.__name__] = evaluation_result + + # Store evaluation results + evaluations.append({ + "idx": idx, + "evaluations": evaluations_dict, + "error": None, + }) - # Prepare result data - result = { - "output": output, - "evaluations": evaluations, - "metadata": output_data["metadata"], - "tags": self.tags, - "error": None #TODO: Add error handling - } except Exception as e: - result = { - "output": output_data.get('output'), + if raise_on_error: + raise e + evaluations.append({ + "idx": idx, "evaluations": {}, - "metadata": output_data["metadata"], - "tags": self.tags, - "error": str(e), - } + "error": { + "message": str(e), + "type": type(e).__name__, + "stack": None, + }, + }) - results_buffer.append(result) completed += 1 _print_progress_bar(completed, total_rows, prefix='Evaluating:', suffix='Complete') - self.has_evaluated = True - self.results.experiment_rows = results_buffer + # Return new ExperimentResults without modifying the experiment's state + return ExperimentResults(self.dataset, self, self.outputs, evaluations) - def run(self, _jobs: int = 10) -> "ExperimentResults": - """Execute the task and evaluations, returning the results.""" - self.run_task(_jobs=_jobs) - self.run_evaluations() - print() # Move to the next line after completion - return self.results + def run( + self, + _jobs: int = 10, + timeout: Optional[float] = None, + retries: int = 0, + max_delay: float = 60.0, + raise_on_error: bool = False, + ) -> "ExperimentResults": + """Execute the task and evaluations, returning the results. - def get_results(self) -> "ExperimentResults": - if not self.has_evaluated: - raise ValueError("Evaluations have not been performed yet. Please call run() or run_evaluations().") - return self.results + Args: + _jobs (int): Number of worker threads. + timeout (float, optional): Time limit for the task execution in seconds. + retries (int): Number of retries for failed tasks. + max_delay (float): Maximum delay between retries in seconds. + + Returns: + ExperimentResults: The results of the experiment. 
+ """ + self.run_task(_jobs=_jobs, timeout=timeout, retries=retries, max_delay=max_delay, raise_on_error=raise_on_error) + experiment_results = self.run_evaluations(raise_on_error=raise_on_error) + print() # Move to the next line after completion + return experiment_results class ExperimentResults: @@ -629,22 +792,122 @@ class ExperimentResults: Attributes: dataset (Dataset): The dataset used in the experiment experiment (Experiment): The experiment that generated these results - experiment_rows (List[Dict]): Results for each processed record + outputs (List[Dict]): Outputs after running the task + evaluations (List[Dict]): Evaluation results after running evaluators """ - def __init__(self, dataset: Dataset, experiment: Experiment) -> None: + def __init__(self, dataset: Dataset, experiment: Experiment, outputs: List[Dict], evaluations: List[Dict]) -> None: self.dataset = dataset self.experiment = experiment - self.experiment_rows = [] + self.outputs = outputs # List of outputs from run_task + self.evaluations = evaluations # List of evaluations from run_evaluations + self.merged_results = self._merge_results() # Merged outputs and evaluations + + def _merge_results(self) -> List[Dict[str, Any]]: + """Merge outputs and evaluations into a single list of results.""" + merged_results = [] + for idx in range(len(self.outputs)): + output_data = self.outputs[idx] + evaluation_data = self.evaluations[idx] + dataset_record = self.dataset[idx] + + merged_result = { + "idx": idx, + "input": dataset_record.get('input', {}), + "expected_output": dataset_record.get('expected_output', {}), + "output": output_data.get('output'), + "evaluations": evaluation_data.get('evaluations', {}), + "metadata": output_data.get('metadata', {}), + "error": output_data.get('error'), + "tags": self.experiment.tags, + } + merged_results.append(merged_result) + return merged_results def __iter__(self) -> Iterator[Dict[str, Any]]: - return iter(self.experiment_rows) + return iter(self.merged_results) def __len__(self) -> int: - return len(self.experiment_rows) + return len(self.merged_results) def __getitem__(self, index: int) -> Any: - return self.experiment_rows[index] + return self.merged_results[index] + + def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": + """Convert the experiment results to a pandas DataFrame, including the experiment config. + + Args: + multiindex (bool): If True, expand nested dictionaries into MultiIndex columns. + If False, keep the nested dictionaries as they are. + + Returns: + pd.DataFrame: A DataFrame representation of the experiment results. + + Raises: + ImportError: If pandas is not installed. + """ + try: + import pandas as pd + except ImportError: + raise ImportError( + "pandas is required to convert experiment results to DataFrame. 
" + "Please install it with `pip install pandas`" + ) + + data = [] + + for result in self.merged_results: + record = {} + if multiindex: + # Flatten 'input' + for k, v in result['input'].items(): + record[('input', k)] = v + # Flatten 'expected_output' + for k, v in result['expected_output'].items(): + record[('expected_output', k)] = v + # Flatten 'output' + output = result.get('output', {}) + if isinstance(output, dict): + for k, v in output.items(): + record[('output', k)] = v + else: + record[('output', 'value')] = output + # Flatten 'evaluations' + for eval_name, eval_result in result['evaluations'].items(): + if isinstance(eval_result, dict): + for k, v in eval_result.items(): + record[('evaluations', eval_name, k)] = v + else: + record[('evaluations', eval_name)] = eval_result + # Flatten 'metadata' + for k, v in result.get('metadata', {}).items(): + record[('metadata', k)] = v + # Include 'config' from the experiment + if self.experiment.config: + for k, v in self.experiment.config.items(): + record[('config', k)] = v + # Flatten 'error' + error = result['error'] + if error: + record[('error', 'message')] = error.get('message') + record[('error', 'type')] = error.get('type') + record[('error', 'stack')] = error.get('stack') + + else: + # Keep nested structures + record['input'] = result['input'] + record['expected_output'] = result['expected_output'] + record['output'] = result.get('output') + record['evaluations'] = result.get('evaluations') + record['metadata'] = result.get('metadata') + record['config'] = self.experiment.config + record['error'] = result.get('error') + data.append(record) + + df = pd.DataFrame(data) + if multiindex: + df.columns = pd.MultiIndex.from_tuples(df.columns) + return df def push(self, overwrite: bool = False) -> None: """Push the experiment results to Datadog. 
@@ -728,7 +991,6 @@ def push(self, overwrite: bool = False) -> None: "dataset_id": self.experiment.dataset._datadog_dataset_id, "project_id": project_id, "metadata": { - "tags": self.experiment.tags, **self.experiment.metadata, "config": self.experiment.config, }, @@ -744,35 +1006,42 @@ def push(self, overwrite: bool = False) -> None: spans = [] metrics = [] - for idx, result in enumerate(self.experiment_rows): + for result in self.merged_results: + idx = result['idx'] + merged_result = result + output = merged_result.get('output') + evaluations = merged_result.get('evaluations', {}) + metadata = merged_result.get('metadata', {}) + error = merged_result.get('error', {}) + + # Prepare span data span = { "span_id": _make_id(), "project_id": project_id, "experiment_id": experiment_id, "dataset_id": self.experiment.dataset._datadog_dataset_id, "dataset_record_id": _make_id(), - "start_ns": int(result["metadata"]["timestamp"] * 1e9), - "duration": float(result["metadata"]["duration"] * 1e9), - "tags": self.experiment.tags, - "status": "ok", + "start_ns": int(metadata.get("timestamp", time.time()) * 1e9), + "duration": float(metadata.get("duration", 0) * 1e9), + "status": "ok" if not error else "error", "metrics": {}, # TODO: Fill in with actual metrics once we have tracing and llm spans "meta": { "span": {"kind": "experiment"}, - "input": self.experiment.dataset[idx]["input"], - "output": result["output"], - "expected_output": self.experiment.dataset[idx].get("expected_output", {}), + "input": merged_result.get('input', {}), + "output": output, + "expected_output": merged_result.get('expected_output', {}), "error": { - "message": result["error"], - "stack": None, - "type": None, - }, + "message": error.get("message"), + "type": error.get("type"), + "stack": error.get("stack"), + } }, } spans.append(span) # Add evaluation metrics - for metric_name, metric_value in result["evaluations"].items(): - timestamp_ms = int(result["metadata"]["timestamp"] * 1000) + for metric_name, metric_value in evaluations.items(): + timestamp_ms = int(metadata.get("timestamp", time.time()) * 1000) # Check for bool first, since bool is a subclass of int if isinstance(metric_value, bool): @@ -792,115 +1061,24 @@ def push(self, overwrite: bool = False) -> None: "score_value" if metric_type == "score" else "categorical_value": metric_value, } - if metric_type == "score": - metric["score_value"] = metric_value - else: - metric["categorical_value"] = metric_value - metrics.append(metric) + # Prepare payload and send to Datadog results_payload = { "data": { "type": "experiments", + "tags": self.experiment.tags + ["ddtrace.version:" + ddtrace.__version__], "attributes": {"spans": spans, "metrics": metrics}, } } + print(json.dumps(results_payload, indent=2)) + url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" exp_http_request("POST", url, body=json.dumps(results_payload).encode("utf-8")) - def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": - """Convert the experiment results to a pandas DataFrame, including the experiment config. - - Args: - multiindex (bool): If True, expand nested dictionaries into MultiIndex columns. - If False, keep the nested dictionaries as they are. - - Returns: - pd.DataFrame: A DataFrame representation of the experiment results. - - Raises: - ImportError: If pandas is not installed. - """ - try: - import pandas as pd - except ImportError: - raise ImportError( - "pandas is required to convert experiment results to DataFrame. 
" - "Please install it with `pip install pandas`" - ) - - # Collect data - data = [] - for result in self.experiment_rows: - record = {} - # Get index of the dataset record - idx = result['metadata'].get('dataset_record_idx') - dataset_record = self.dataset[idx] - - if multiindex: - - # Flatten 'input' and 'expected_output' from the dataset - for k, v in dataset_record.get('input', {}).items(): - record[('input', k)] = v - for k, v in dataset_record.get('expected_output', {}).items(): - record[('expected_output', k)] = v - - # Flatten 'output' from the result - output = result.get('output', {}) - if isinstance(output, dict): - for k, v in output.items(): - record[('output', k)] = v - else: - record[('output', 'value')] = output - - # Flatten 'evaluations' from the result - evaluations = result.get('evaluations', {}) - for evaluator_name, evaluation in evaluations.items(): - if isinstance(evaluation, dict): - for k, v in evaluation.items(): - record[('evaluations', evaluator_name, k)] = v - else: - record[('evaluations', evaluator_name)] = evaluation - - # Flatten 'config' from the experiment, if it exists - if self.experiment.config: - for k, v in self.experiment.config.items(): - record[('config', k)] = v - - # Flatten 'metadata' from the result - for k, v in result.get('metadata', {}).items(): - # Skip project_name, experiment_name, and dataset_name - if k not in ['project_name', 'experiment_name', 'dataset_name']: - record[('metadata', k)] = v - - - # Include 'error' if any - error = result.get('error') - if error: - record[('error', 'message')] = error - - else: - # Include config as a dictionary, if it exists - if self.experiment.config: - record['config'] = self.experiment.config - - # Keep nested dictionaries - record['input'] = dataset_record.get('input', {}) - record['expected_output'] = dataset_record.get('expected_output', {}) - record['output'] = result.get('output', {}) - record['evaluations'] = result.get('evaluations', {}) - record['metadata'] = result.get('metadata', {}) - record['tags'] = result.get('tags', []) - record['error'] = result.get('error') - - data.append(record) - - df = pd.DataFrame(data) - if multiindex: - # Set columns as MultiIndex - df.columns = pd.MultiIndex.from_tuples(df.columns) - return df + # Print URL to the experiment in Datadog + print(f"Experiment '{self.experiment.name}' created: {BASE_URL}/llm/experiments/experiment-list/{experiment_id}") def export_to_jsonl(self, file_path): """ @@ -912,7 +1090,7 @@ def export_to_jsonl(self, file_path): import json with open(file_path, 'w') as f: - for result in self.experiment_rows: + for result in self.merged_results: json_line = json.dumps(result) f.write(json_line + '\n') @@ -946,6 +1124,8 @@ def exp_http_request(method: str, url: str, body: Optional[bytes] = None) -> HTT } url = BASE_URL + url resp = http_request(method, url, headers=headers, body=body) + if resp.status_code == 403: + raise ValueError("API key or application key is incorrect.") if resp.status_code >= 400: try: error_details = resp.json() @@ -992,6 +1172,15 @@ def wrapper(expected_output: Dict[str, Any], output: Any, input: Dict[str, Any] return wrapper +def _print_progress_bar(iteration, total, prefix='', suffix='', decimals=1, length=50, fill='█'): + percent = f"{100 * (iteration / float(total)):.{decimals}f}" + filled_length = int(length * iteration // total) + bar = fill * filled_length + '-' * (length - filled_length) + print(f'\r{prefix} |{bar}| {percent}% {suffix}', end='\r') + if iteration == total: + print() + + class 
ExperimentGrid: """Class to run a grid of experiments over multiple parameter combinations. @@ -1097,12 +1286,3 @@ def get_all_results(self) -> List[ExperimentResults]: List[ExperimentResults]: A list of results for each experiment. """ return self.results - - -def _print_progress_bar(iteration, total, prefix='', suffix='', decimals=1, length=50, fill='█'): - percent = f"{100 * (iteration / float(total)):.{decimals}f}" - filled_length = int(length * iteration // total) - bar = fill * filled_length + '-' * (length - filled_length) - print(f'\r{prefix} |{bar}| {percent}% {suffix}', end='\r') - if iteration == total: - print() From 2608ba5734a9d299d62272ecb447ee5240bf7b32 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Sun, 10 Nov 2024 17:54:43 -0500 Subject: [PATCH 19/36] fix import --- tests/appsec/iast/fixtures/propagation_path.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/appsec/iast/fixtures/propagation_path.py b/tests/appsec/iast/fixtures/propagation_path.py index d645e781e3f..7dcaa737995 100644 --- a/tests/appsec/iast/fixtures/propagation_path.py +++ b/tests/appsec/iast/fixtures/propagation_path.py @@ -2,12 +2,13 @@ CAVEAT: the line number is important to some IAST tests, be careful to modify this file and update the tests if you make some changes """ -import _io import asyncio import os import re import sys +import _io + ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) From 5cbfd70a57958f9a6fc7a3f78cb146f532f2c606 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Sun, 10 Nov 2024 18:43:42 -0500 Subject: [PATCH 20/36] docstring --- ddtrace/llmobs/_experiments.py | 90 +++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 9c6f3f3b3c9..1a30e3c0780 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -213,6 +213,24 @@ def from_csv( expected_output_columns: List[str] = None, metadata_columns: List[str] = None, ) -> "Dataset": + """Create a Dataset from a CSV file. + + Args: + filepath: Path to the CSV file + name: Name of the dataset + description: Optional description of the dataset + delimiter: CSV delimiter character, defaults to comma + input_columns: List of column names to use as input data + expected_output_columns: List of column names to use as expected output data + metadata_columns: Optional list of column names to include as metadata + + Returns: + Dataset: A new Dataset instance containing the CSV data + + Raises: + ValueError: If input_columns or expected_output_columns are not provided + Exception: If there are issues reading the CSV file + """ if input_columns is None or expected_output_columns is None: raise ValueError("`input_columns` and `expected_output_columns` must be provided.") @@ -258,6 +276,23 @@ def from_csv( @classmethod def from_jsonl(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": + """Create a Dataset from a JSONL file. 
+ + Args: + filepath: Path to the JSONL file + name: Name of the dataset + description: Optional description of the dataset + input_columns: List of column names to use as input data + expected_output_columns: List of column names to use as expected output data + metadata_columns: Optional list of column names to include as metadata + + Returns: + Dataset: A new Dataset instance containing the JSONL data + + Raises: + ValueError: If input_columns or expected_output_columns are not provided + Exception: If there are issues reading the JSONL file + """ if input_columns is None or expected_output_columns is None: raise ValueError("`input_columns` and `expected_output_columns` must be provided.") @@ -289,6 +324,25 @@ def from_jsonl(cls, filepath: str, name: str, description: str = "", input_colum @classmethod def from_parquet(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": + """Create a Dataset from a Parquet file. + + Args: + filepath: Path to the Parquet file + name: Name of the dataset + description: Optional description of the dataset + input_columns: List of column names to use as input data + expected_output_columns: List of column names to use as expected output data + metadata_columns: Optional list of column names to include as metadata + + Returns: + Dataset: A new Dataset instance containing the Parquet data + + Raises: + ImportError: If pandas is not installed + ValueError: If input_columns or expected_output_columns are not provided, + if the Parquet file is empty, or if specified columns are missing + Exception: If there are issues reading the Parquet file + """ try: import pandas as pd except ImportError: @@ -340,6 +394,24 @@ def from_parquet(cls, filepath: str, name: str, description: str = "", input_col @classmethod def import_file(cls, path: str, filetype: FileType, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None, delimiter: str = ",") -> "Dataset": + """Import a dataset from a file. + + Args: + path (str): Path to the input file + filetype (FileType): Type of file to import (CSV, JSONL, or PARQUET) + name (str): Name of the dataset + description (str, optional): Description of the dataset. Defaults to "". + input_columns (List[str], optional): List of column names to use as input data. Required for CSV and PARQUET files. + expected_output_columns (List[str], optional): List of column names to use as expected output data. Required for CSV and PARQUET files. + metadata_columns (List[str], optional): List of column names to include as metadata. Defaults to None. + delimiter (str, optional): Delimiter character for CSV files. Defaults to ",". + + Returns: + Dataset: A new Dataset instance containing the imported data + + Raises: + ValueError: If filetype is not supported or if required columns are missing + """ if filetype == FileType.CSV: return cls.from_csv( filepath=path, @@ -499,7 +571,21 @@ def run_task( max_delay: float = 60.0, raise_on_error: bool = False, ) -> None: - """Execute the task function on the dataset and store the outputs.""" + """Execute the task function on the dataset and store the outputs. + + Args: + _jobs: Number of concurrent jobs to run (between 1-20). Defaults to 10. + timeout: Maximum time in seconds to wait for each task execution. + If None, will wait indefinitely. Defaults to None. 
+ retries: Number of retry attempts for failed tasks. Defaults to 0. + max_delay: Maximum delay in seconds between retries using exponential backoff. + Defaults to 60 seconds. + raise_on_error: If True, raises exceptions from failed tasks. If False, stores + errors in the output. Defaults to False. + + Raises: + ValueError: If _jobs is not between 1 and 20, or if retries is negative. + """ if not 1 <= _jobs <= 20: raise ValueError("Number of jobs must be between 1 and 20") if retries < 0: @@ -773,6 +859,8 @@ def run( timeout (float, optional): Time limit for the task execution in seconds. retries (int): Number of retries for failed tasks. max_delay (float): Maximum delay between retries in seconds. + raise_on_error (bool): If True, raises exceptions from failed tasks. If False, stores + errors in the output. Defaults to False. Returns: ExperimentResults: The results of the experiment. From bed12614d0a45ebc9a2b86f956e428d7a9d385f6 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Sun, 10 Nov 2024 18:58:03 -0500 Subject: [PATCH 21/36] Custom Exception classes --- ddtrace/llmobs/_experiments.py | 60 ++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 1a30e3c0780..c9b0846b604 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -269,8 +269,14 @@ def from_csv( 'expected_output': expected_output_data, **metadata, }) + except FileNotFoundError as e: + raise DatasetFileError(f"CSV file not found: {filepath}") from e + except PermissionError as e: + raise DatasetFileError(f"Permission denied when reading CSV file: {filepath}") from e + except csv.Error as e: + raise DatasetFileError(f"Error parsing CSV file: {e}") from e except Exception as e: - raise Exception(f"Failed to read CSV file: {e}") + raise DatasetFileError(f"Unexpected error reading CSV file: {e}") from e return cls(name=name, data=data, description=description) @@ -317,8 +323,14 @@ def from_jsonl(cls, filepath: str, name: str, description: str = "", input_colum if not data: raise ValueError("JSONL file is empty.") + except FileNotFoundError as e: + raise DatasetFileError(f"JSONL file not found: {filepath}") from e + except PermissionError as e: + raise DatasetFileError(f"Permission denied when reading JSONL file: {filepath}") from e + except json.JSONDecodeError as e: + raise DatasetFileError(f"Error parsing JSONL file: {e}") from e except Exception as e: - raise Exception(f"Failed to read JSONL file: {e}") + raise DatasetFileError(f"Unexpected error reading JSONL file: {e}") from e return cls(name=name, data=data, description=description) @@ -387,8 +399,12 @@ def from_parquet(cls, filepath: str, name: str, description: str = "", input_col **metadata, }) + except FileNotFoundError as e: + raise DatasetFileError(f"Parquet file not found: {filepath}") from e + except PermissionError as e: + raise DatasetFileError(f"Permission denied when reading Parquet file: {filepath}") from e except Exception as e: - raise Exception(f"Failed to read Parquet file: {e}") + raise DatasetFileError(f"Error reading Parquet file: {e}") from e return cls(name=name, data=data, description=description) @@ -485,9 +501,7 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": # Set columns as MultiIndex df.columns = pd.MultiIndex.from_tuples(df.columns) return df - else: - # Keep 'input' and 'expected_output' as dicts in the DataFrame - return pd.DataFrame(self._data) + return pd.DataFrame(self._data) 
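The loader changes in this commit funnel FileNotFoundError, PermissionError, and parse failures into the DatasetFileError type defined at the bottom of this patch, so callers can guard ingestion with a single except clause. A brief usage sketch with a purely illustrative file path and column names; the import path for DatasetFileError is assumed from where this patch defines it:

    from ddtrace.llmobs import Dataset
    from ddtrace.llmobs._experiments import DatasetFileError  # assumed module path

    try:
        dataset = Dataset.from_csv(
            filepath="capitals.csv",                 # illustrative file
            name="capitals-of-the-world",
            input_columns=["country"],
            expected_output_columns=["capital"],
        )
    except DatasetFileError as exc:
        # Missing file, permission problem, or CSV parse error; the original
        # exception stays attached via the `raise ... from e` chaining above.
        print(f"Dataset could not be loaded: {exc}")
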
def export_to_jsonl(self, file_path): """ @@ -608,8 +622,7 @@ def process_row(idx_row): def execute_task(): if getattr(self.task, '_accepts_config', False): return self.task(input_data, self.config) - else: - return self.task(input_data) + return self.task(input_data) # Use ThreadPoolExecutor to enforce timeout with concurrent.futures.ThreadPoolExecutor(max_workers=1) as single_executor: @@ -645,8 +658,8 @@ def execute_task(): except concurrent.futures.TimeoutError as e: if raise_on_error: - # Reraise the exception to trigger cancellation - raise Exception(f"TimeoutError in task for row {idx}: {e}") from e + # Raise specific experiment task error + raise ExperimentTaskError(f"Task timed out after {timeout} seconds", idx, e) if attempt < retries: # Exponential backoff and retry sleep_time = min(delay, max_delay) @@ -667,7 +680,7 @@ def execute_task(): "dataset_name": self.dataset.name, }, "error": { - "message": "Task timed out", + "message": f"Task timed out after {timeout} seconds", "stack": None, "type": "TimeoutError", } @@ -676,9 +689,8 @@ def execute_task(): except Exception as e: if raise_on_error: - # Reraise the exception to trigger cancellation - error_type = type(e).__name__ - raise Exception(f"Exception in task for row {idx}: {error_type}: {e}") from e + # Raise specific experiment task error + raise ExperimentTaskError(str(e), idx, e) if attempt < retries: # Exponential backoff and retry sleep_time = min(delay, max_delay) @@ -1210,8 +1222,8 @@ def exp_http_request(method: str, url: str, body: Optional[bytes] = None) -> HTT "DD-APPLICATION-KEY": os.getenv("DD_APPLICATION_KEY"), "Content-Type": "application/json", } - url = BASE_URL + url - resp = http_request(method, url, headers=headers, body=body) + full_url = BASE_URL + url + resp = http_request(method, full_url, headers=headers, body=body) if resp.status_code == 403: raise ValueError("API key or application key is incorrect.") if resp.status_code >= 400: @@ -1233,8 +1245,7 @@ def wrapper(input: Dict[str, Any], config: Optional[Dict[str, Any]] = None) -> A # Call the original function with or without config if 'config' in inspect.signature(func).parameters: return func(input, config) - else: - return func(input) + return func(input) # Enforce signature compliance sig = inspect.signature(func) params = sig.parameters @@ -1374,3 +1385,16 @@ def get_all_results(self) -> List[ExperimentResults]: List[ExperimentResults]: A list of results for each experiment. 
""" return self.results + + +class DatasetFileError(Exception): + """Exception raised when there are errors reading or processing dataset files.""" + pass + + +class ExperimentTaskError(Exception): + """Exception raised when a task fails during experiment execution.""" + def __init__(self, message: str, row_idx: int, original_error: Exception = None): + self.row_idx = row_idx + self.original_error = original_error + super().__init__(f"Task failed on row {row_idx}: {message}") From 0928224ce247ebb2287b0bbf9aa61ac98f64d0e9 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Mon, 11 Nov 2024 18:29:35 -0500 Subject: [PATCH 22/36] handle duration errors --- ddtrace/llmobs/_experiments.py | 38 ++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index c9b0846b604..6202bb28074 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,8 +1,6 @@ # TODO: Test failures on badly defined evaluators # TODO: Test workflows for re-evals and publishing results -# TODO: Handle behavior pushing experiment results without dataset # TODO: Idempotency of push/pull methods -# TODO: Support running on subsets of datasets import concurrent.futures from datetime import datetime @@ -614,10 +612,11 @@ def process_row(idx_row): delay = 1.0 # Initial delay in seconds while attempt <= retries: + start_time = time.time() try: # Extract the input data input_data = row['input'] - start_time = time.time() + def execute_task(): if getattr(self.task, '_accepts_config', False): @@ -629,8 +628,6 @@ def execute_task(): future = single_executor.submit(execute_task) output = future.result(timeout=timeout) - end_time = time.time() - duration = end_time - start_time # Ensure output is a dictionary if not isinstance(output, dict): @@ -642,7 +639,7 @@ def execute_task(): "output": output, "metadata": { "timestamp": start_time, - "duration": duration, + "duration": time.time() - start_time, "dataset_record_idx": idx, "project_name": self.project_name, "experiment_name": self.name, @@ -672,8 +669,8 @@ def execute_task(): "idx": idx, "output": None, "metadata": { - "timestamp": time.time(), - "duration": 0, + "timestamp": start_time, + "duration": time.time() - start_time, "dataset_record_idx": idx, "project_name": self.project_name, "experiment_name": self.name, @@ -703,8 +700,8 @@ def execute_task(): "idx": idx, "output": None, "metadata": { - "timestamp": time.time(), - "duration": 0, + "timestamp": start_time, + "duration": time.time() - start_time, "dataset_record_idx": idx, "project_name": self.project_name, "experiment_name": self.name, @@ -733,6 +730,7 @@ def execute_task(): try: for future in concurrent.futures.as_completed(futures): idx = futures[future] + start_time = time.time() try: output_data = future.result() outputs_buffer[idx] = output_data @@ -746,8 +744,8 @@ def execute_task(): "idx": idx, "output": None, "metadata": { - "timestamp": time.time(), - "duration": 0, + "timestamp": start_time, + "duration": time.time() - start_time, "dataset_record_idx": idx, "project_name": self.project_name, "experiment_name": self.name, @@ -970,8 +968,6 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": if isinstance(output, dict): for k, v in output.items(): record[('output', k)] = v - else: - record[('output', 'value')] = output # Flatten 'evaluations' for eval_name, eval_result in result['evaluations'].items(): if isinstance(eval_result, dict): @@ -1004,6 +1000,7 @@ def 
as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": record['error'] = result.get('error') data.append(record) + df = pd.DataFrame(data) if multiindex: df.columns = pd.MultiIndex.from_tuples(df.columns) @@ -1110,17 +1107,22 @@ def push(self, overwrite: bool = False) -> None: idx = result['idx'] merged_result = result output = merged_result.get('output') + input = merged_result.get('input', {}) evaluations = merged_result.get('evaluations', {}) + expected_output = merged_result.get('expected_output', {}) metadata = merged_result.get('metadata', {}) error = merged_result.get('error', {}) - # Prepare span data + # When the dataset is not hosted, we use the hash of the input and expected output as the dataset record id + dataset_record_id = hashlib.md5((str(input) + str(expected_output)).encode('utf-8')).hexdigest() + span = { "span_id": _make_id(), "project_id": project_id, "experiment_id": experiment_id, "dataset_id": self.experiment.dataset._datadog_dataset_id, - "dataset_record_id": _make_id(), + #TODO: Extract the record id from the dataset for hosted datasets + "dataset_record_id": dataset_record_id, "start_ns": int(metadata.get("timestamp", time.time()) * 1e9), "duration": float(metadata.get("duration", 0) * 1e9), "status": "ok" if not error else "error", @@ -1373,8 +1375,8 @@ def run(self, _jobs: int = 10): _jobs (int): Number of parallel workers for each experiment run. """ for experiment in self.experiments: - experiment.run(_jobs=_jobs) - self.results.append(experiment.get_results()) + results = experiment.run(_jobs=_jobs) + self.results.append(results) return self.results From cac1476b4c4311d71fa34219d9904dfbab10db3e Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 12 Nov 2024 12:30:43 -0500 Subject: [PATCH 23/36] more stuff --- ddtrace/llmobs/_experiments.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 6202bb28074..24899214eb7 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -279,7 +279,7 @@ def from_csv( return cls(name=name, data=data, description=description) @classmethod - def from_jsonl(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": + def _from_jsonl(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": """Create a Dataset from a JSONL file. Args: @@ -333,7 +333,7 @@ def from_jsonl(cls, filepath: str, name: str, description: str = "", input_colum return cls(name=name, data=data, description=description) @classmethod - def from_parquet(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": + def _from_parquet(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": """Create a Dataset from a Parquet file. 
Args: @@ -407,7 +407,7 @@ def from_parquet(cls, filepath: str, name: str, description: str = "", input_col return cls(name=name, data=data, description=description) @classmethod - def import_file(cls, path: str, filetype: FileType, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None, delimiter: str = ",") -> "Dataset": + def load(cls, path: str, filetype: FileType, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None, delimiter: str = ",") -> "Dataset": """Import a dataset from a file. Args: @@ -437,7 +437,7 @@ def import_file(cls, path: str, filetype: FileType, name: str, description: str metadata_columns=metadata_columns, ) elif filetype == FileType.JSONL: - return cls.from_jsonl( + return cls._from_jsonl( filepath=path, name=name, description=description, @@ -446,7 +446,7 @@ def import_file(cls, path: str, filetype: FileType, name: str, description: str metadata_columns=metadata_columns, ) elif filetype == FileType.PARQUET: - return cls.from_parquet( + return cls._from_parquet( filepath=path, name=name, description=description, @@ -578,9 +578,9 @@ def __init__( def run_task( self, _jobs: int = 10, - timeout: Optional[float] = None, - retries: int = 0, - max_delay: float = 60.0, + _timeout: Optional[float] = None, + _retries: int = 0, + _max_delay: float = 60.0, raise_on_error: bool = False, ) -> None: """Execute the task function on the dataset and store the outputs. @@ -875,7 +875,7 @@ def run( Returns: ExperimentResults: The results of the experiment. """ - self.run_task(_jobs=_jobs, timeout=timeout, retries=retries, max_delay=max_delay, raise_on_error=raise_on_error) + self.run_task(_jobs=_jobs, _timeout=timeout, _retries=retries, _max_delay=max_delay, raise_on_error=raise_on_error) experiment_results = self.run_evaluations(raise_on_error=raise_on_error) print() # Move to the next line after completion return experiment_results From 9024e14ba3bfe5b4bbdcbc88b48752d87001c048 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Thu, 14 Nov 2024 11:46:08 -0500 Subject: [PATCH 24/36] support polymorphic i/o --- ddtrace/llmobs/_experiments.py | 412 ++++++++++++++++++++++++--------- 1 file changed, 308 insertions(+), 104 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 24899214eb7..1b0a3987d06 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,13 +1,12 @@ -# TODO: Test failures on badly defined evaluators +# TODO: Test failures on eval, how do we set errors # TODO: Test workflows for re-evals and publishing results -# TODO: Idempotency of push/pull methods import concurrent.futures from datetime import datetime import json import os import time -from typing import Any, Callable, Dict, Iterator, List, Optional +from typing import Any, Callable, Dict, Iterator, List, Optional, Union import inspect from functools import wraps from urllib.parse import quote @@ -43,7 +42,14 @@ class Dataset: description (str): Optional description of the dataset """ - def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") -> None: + def __init__(self, name: str, data: List[Dict[str, Union[str, Dict[str, Any]]]], description: str = "") -> None: + """ + Args: + name: Name of the dataset + data: List of dictionaries where 'input' and 'expected_output' values can be + either strings or dictionaries of strings + description: 
Optional description of the dataset + """ self.name = name self.description = description self._validate_data(data) @@ -52,16 +58,36 @@ def __init__(self, name: str, data: List[Dict[str, Any]], description: str = "") # Post-push attributes self._datadog_dataset_id = None - def __iter__(self) -> Iterator[Dict[str, Any]]: + def __iter__(self) -> Iterator[Dict[str, Union[str, Dict[str, Any]]]]: return iter(self._data) def __len__(self) -> int: return len(self._data) - def __getitem__(self, index: int) -> Dict[str, Any]: - return self._data[index] - - def _validate_data(self, data: List[Dict[str, Any]]) -> None: + def __getitem__(self, index: int) -> Dict[str, Union[str, Dict[str, Any]]]: + """Get a dataset record, converting _str_value dictionaries back to strings. + + Args: + index: Index of the record to retrieve + + Returns: + Dict containing the record with any _str_value values converted to strings + """ + record = self._data[index].copy() + + # Convert input if it has _str_value + if 'input' in record and isinstance(record['input'], dict): + if '_str_value' in record['input'] and len(record['input']) == 1: + record['input'] = record['input']['_str_value'] + + # Convert expected_output if it has _str_value + if 'expected_output' in record and isinstance(record['expected_output'], dict): + if '_str_value' in record['expected_output'] and len(record['expected_output']) == 1: + record['expected_output'] = record['expected_output']['_str_value'] + + return record + + def _validate_data(self, data: List[Dict[str, Union[str, Dict[str, Any]]]]) -> None: """Validate the format and structure of dataset records. Args: @@ -69,8 +95,7 @@ def _validate_data(self, data: List[Dict[str, Any]]) -> None: Raises: ValueError: If data is empty, contains non-dictionary rows, - has inconsistent keys, contains nested dictionaries, - or exceeds 50,000 rows + has inconsistent keys, or exceeds 50,000 rows """ if not data: raise ValueError("Data cannot be empty.") @@ -86,20 +111,27 @@ def _validate_data(self, data: List[Dict[str, Any]]) -> None: if set(row.keys()) != first_row_keys: raise ValueError("All rows must have the same keys.") - # Validate that 'input' exists and is a dictionary - if 'input' not in row: - raise ValueError("Each row must contain an 'input' field") - if not isinstance(row['input'], dict): - raise ValueError("The 'input' field must be a dictionary") - - # If expected_output exists, validate it's a dictionary - if 'expected_output' in row and not isinstance(row['expected_output'], dict): - raise ValueError("The 'expected_output' field must be a dictionary") - - # Check that 'input' and 'expected_output' are flat dictionaries - for key in ["input", "expected_output"]: - if key in row and any(isinstance(value, dict) for value in row[key].values()): - raise ValueError(f"'{key}' must be a flat dictionary (no nested dictionaries).") + # Validate input if present + if 'input' in row: + if isinstance(row['input'], str): + # Convert string to dict with _str_value key + row['input'] = {'_str_value': row['input']} + elif isinstance(row['input'], dict): + # Do nothing + pass + else: + raise ValueError("The 'input' field must be either a string or a dictionary") + + # Validate expected_output if present + if 'expected_output' in row: + if isinstance(row['expected_output'], str): + # Convert string to dict with _str_value key + row['expected_output'] = {'_str_value': row['expected_output']} + elif isinstance(row['expected_output'], dict): + # Do nothing + pass + else: + raise ValueError("The 
'expected_output' field must be either a string or a dictionary") @classmethod def pull(cls, name: str) -> "Dataset": @@ -137,13 +169,26 @@ def pull(cls, name: str) -> "Dataset": class_records = [] for record in records_data.get("data", []): attrs = record.get("attributes", {}) - class_records.append( - { - "input": attrs.get("input", {}), - "expected_output": attrs.get("expected_output", {}), - **attrs.get("metadata", {}), - } - ) + input_data = attrs.get("input") + expected_output = attrs.get("expected_output") + + print(input_data, expected_output) + + # Handle input data format + if isinstance(input_data, str): + input_data = {'_str_value': input_data} + # For dictionaries, keep as-is (no conversion needed) + + # Handle expected output format + if isinstance(expected_output, str): + expected_output = {'_str_value': expected_output} + # For dictionaries, keep as-is (no conversion needed) + + class_records.append({ + "input": input_data, + "expected_output": expected_output, + **attrs.get("metadata", {}), + }) # Create new dataset instance dataset = cls(name, class_records) @@ -154,7 +199,7 @@ def push(self) -> None: """Push the dataset to Datadog. Returns: - Dict[str, str]: Dictionary containing dataset information including: + Dict[str, Any]: Dictionary containing dataset information including: - dataset_id: The ID of the created/updated dataset - dataset_name: The name of the dataset - record_count: Number of records uploaded @@ -256,8 +301,18 @@ def from_csv( raise ValueError(f"Metadata columns not found in CSV header: {missing_metadata_columns}") for row in rows: - input_data = {col: row[col] for col in input_columns} - expected_output_data = {col: row[col] for col in expected_output_columns} + # If single column, use string value wrapped in dict + if len(input_columns) == 1: + input_data = {'_str_value': row[input_columns[0]]} + else: + input_data = {col: row[col] for col in input_columns} + + # If single column, use string value wrapped in dict + if len(expected_output_columns) == 1: + expected_output_data = {'_str_value': row[expected_output_columns[0]]} + else: + expected_output_data = {col: row[col] for col in expected_output_columns} + metadata = {} if metadata_columns: metadata = {col: row[col] for col in metadata_columns} @@ -479,27 +534,70 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": ) if multiindex: - # Create a list of flattened dictionaries - flattened_data = [] + column_tuples = set() + data_rows = [] for record in self._data: flat_record = {} + # Handle 'input' fields - for k, v in record.get('input', {}).items(): - flat_record[('input', k)] = v + input_data = record.get('input', {}) + if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1: + flat_record[('input', '')] = input_data['_str_value'] + column_tuples.add(('input', '')) + else: + for k, v in input_data.items(): + flat_record[('input', k)] = v + column_tuples.add(('input', k)) + # Handle 'expected_output' fields - for k, v in record.get('expected_output', {}).items(): - flat_record[('expected_output', k)] = v + expected_output = record.get('expected_output', {}) + if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1: + flat_record[('expected_output', '')] = expected_output['_str_value'] + column_tuples.add(('expected_output', '')) + else: + for k, v in expected_output.items(): + flat_record[('expected_output', k)] = v + column_tuples.add(('expected_output', k)) + # Handle any other top-level fields for 
k, v in record.items(): if k not in ['input', 'expected_output']: flat_record[('metadata', k)] = v - flattened_data.append(flat_record) + column_tuples.add(('metadata', k)) + data_rows.append(flat_record) + + # Convert column_tuples to a sorted list to maintain consistent column order + column_tuples = sorted(list(column_tuples)) + + # Build the DataFrame + records_list = [] + for flat_record in data_rows: + row = [flat_record.get(col, None) for col in column_tuples] + records_list.append(row) + + df = pd.DataFrame(records_list, columns=pd.MultiIndex.from_tuples(column_tuples)) - df = pd.DataFrame(flattened_data) - # Set columns as MultiIndex - df.columns = pd.MultiIndex.from_tuples(df.columns) return df - return pd.DataFrame(self._data) + + else: + # For non-multiindex, convert _str_value in the nested structures + data = [] + for record in self._data: + new_record = {} + input_data = record.get('input', {}) + new_record['input'] = (input_data['_str_value'] + if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1 + else input_data) + expected_output = record.get('expected_output', {}) + new_record['expected_output'] = (expected_output['_str_value'] + if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1 + else expected_output) + # Copy other fields + for k, v in record.items(): + if k not in ['input', 'expected_output']: + new_record[k] = v + data.append(new_record) + return pd.DataFrame(data) def export_to_jsonl(self, file_path): """ @@ -600,7 +698,7 @@ def run_task( """ if not 1 <= _jobs <= 20: raise ValueError("Number of jobs must be between 1 and 20") - if retries < 0: + if _retries < 0: raise ValueError("Number of retries must be non-negative") self.outputs = [] total_rows = len(self.dataset) @@ -611,12 +709,13 @@ def process_row(idx_row): attempt = 0 delay = 1.0 # Initial delay in seconds - while attempt <= retries: + while attempt <= _retries: start_time = time.time() try: - # Extract the input data + # Extract the input data and convert if it's a _str_value dict input_data = row['input'] - + if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1: + input_data = input_data['_str_value'] def execute_task(): if getattr(self.task, '_accepts_config', False): @@ -626,11 +725,12 @@ def execute_task(): # Use ThreadPoolExecutor to enforce timeout with concurrent.futures.ThreadPoolExecutor(max_workers=1) as single_executor: future = single_executor.submit(execute_task) - output = future.result(timeout=timeout) + output = future.result(timeout=_timeout) - - # Ensure output is a dictionary - if not isinstance(output, dict): + # Ensure output is a dictionary with _str_value for strings + if isinstance(output, str): + output = {'_str_value': output} + elif not isinstance(output, dict): output = {'value': output} # Prepare output data @@ -654,12 +754,13 @@ def execute_task(): return output_data except concurrent.futures.TimeoutError as e: + print(f"Timeout error: {e}") if raise_on_error: # Raise specific experiment task error - raise ExperimentTaskError(f"Task timed out after {timeout} seconds", idx, e) - if attempt < retries: + raise ExperimentTaskError(f"Task timed out after {_timeout} seconds", idx, e) + if attempt < _retries: # Exponential backoff and retry - sleep_time = min(delay, max_delay) + sleep_time = min(delay, _max_delay) time.sleep(sleep_time) delay *= 2 attempt += 1 @@ -677,7 +778,7 @@ def execute_task(): "dataset_name": self.dataset.name, }, "error": { - "message": 
f"Task timed out after {timeout} seconds", + "message": f"Task timed out after {_timeout} seconds", "stack": None, "type": "TimeoutError", } @@ -685,12 +786,13 @@ def execute_task(): return output_data except Exception as e: + print(f"Error: {e}") if raise_on_error: # Raise specific experiment task error raise ExperimentTaskError(str(e), idx, e) - if attempt < retries: + if attempt < _retries: # Exponential backoff and retry - sleep_time = min(delay, max_delay) + sleep_time = min(delay, _max_delay) time.sleep(sleep_time) delay *= 2 attempt += 1 @@ -740,6 +842,7 @@ def execute_task(): error_exception = Exception(f"Task failed on row {idx}: {output_data['error']['message']}") break except Exception as e: + print(f"Error: {e}") outputs_buffer[idx] = { "idx": idx, "output": None, @@ -766,6 +869,7 @@ def execute_task(): _print_progress_bar(completed, total_rows, prefix='Processing:', suffix='Complete') finally: if error_occurred: + print(f"Error occurred: {error_exception}") # Cancel all pending futures for future in futures: future.cancel() @@ -810,23 +914,37 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_on_ total_rows = len(self.outputs) completed = 0 - # Initialize the progress bar _print_progress_bar(0, total_rows, prefix='Evaluating:', suffix='Complete') for idx, output_data in enumerate(self.outputs): try: - # Retrieve output from outputs output = output_data["output"] + # Convert output if it has '_str_value' + if isinstance(output, dict) and '_str_value' in output and len(output) == 1: + output = output['_str_value'] + # Get the corresponding dataset row dataset_row = self.dataset[idx] input_data = dataset_row.get('input', {}) expected_output = dataset_row.get('expected_output', {}) + + # Convert input_data if it has '_str_value' + if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1: + input_data = input_data['_str_value'] + + # Convert expected_output if it has '_str_value' + if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1: + expected_output = expected_output['_str_value'] # Perform evaluation evaluations_dict = {} for evaluator in evaluators_to_use: - evaluation_result = evaluator(expected_output, output, input_data) - evaluations_dict[evaluator.__name__] = evaluation_result + try: + evaluation_result = evaluator(expected_output, output, input_data) + evaluations_dict[evaluator.__name__] = evaluation_result + except Exception as e: + print(f"Error evaluating row {idx}: {type(e).__name__}: {e}, with evaluator {evaluator.__name__}") + raise e # Store evaluation results evaluations.append({ @@ -907,7 +1025,7 @@ def _merge_results(self) -> List[Dict[str, Any]]: for idx in range(len(self.outputs)): output_data = self.outputs[idx] evaluation_data = self.evaluations[idx] - dataset_record = self.dataset[idx] + dataset_record = self.dataset._data[idx] merged_result = { "idx": idx, @@ -929,14 +1047,39 @@ def __len__(self) -> int: return len(self.merged_results) def __getitem__(self, index: int) -> Any: - return self.merged_results[index] + """Get a result record, converting _str_value dictionaries back to strings. 
+ + Args: + index: Index of the record to retrieve + + Returns: + Dict containing the record with any _str_value values converted to strings + """ + result = self.merged_results[index].copy() + + # Convert input if it has _str_value + if 'input' in result and isinstance(result['input'], dict): + if '_str_value' in result['input'] and len(result['input']) == 1: + result['input'] = result['input']['_str_value'] + + # Convert expected_output if it has _str_value + if 'expected_output' in result and isinstance(result['expected_output'], dict): + if '_str_value' in result['expected_output'] and len(result['expected_output']) == 1: + result['expected_output'] = result['expected_output']['_str_value'] + + # Convert output if it has _str_value + if 'output' in result and isinstance(result['output'], dict): + if '_str_value' in result['output'] and len(result['output']) == 1: + result['output'] = result['output']['_str_value'] + + return result def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": """Convert the experiment results to a pandas DataFrame, including the experiment config. Args: multiindex (bool): If True, expand nested dictionaries into MultiIndex columns. - If False, keep the nested dictionaries as they are. + If False, keep the nested dictionaries as they are. Returns: pd.DataFrame: A DataFrame representation of the experiment results. @@ -952,59 +1095,119 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": "Please install it with `pip install pandas`" ) - data = [] + # Define the desired column order + COLUMN_ORDER = ['input', 'expected_output', 'output', 'evaluations', 'metadata', 'config', 'error'] + + data_rows = [] + column_tuples = set() for result in self.merged_results: record = {} + if multiindex: - # Flatten 'input' - for k, v in result['input'].items(): - record[('input', k)] = v - # Flatten 'expected_output' - for k, v in result['expected_output'].items(): - record[('expected_output', k)] = v - # Flatten 'output' + # Handle 'input' fields + input_data = result.get('input', {}) + if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1: + record[('input', '')] = input_data['_str_value'] + column_tuples.add(('input', '')) + else: + for k, v in input_data.items(): + record[('input', k)] = v + column_tuples.add(('input', k)) + + # Handle 'expected_output' fields + expected_output = result.get('expected_output', {}) + if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1: + record[('expected_output', '')] = expected_output['_str_value'] + column_tuples.add(('expected_output', '')) + else: + for k, v in expected_output.items(): + record[('expected_output', k)] = v + column_tuples.add(('expected_output', k)) + + # Handle 'output' fields output = result.get('output', {}) if isinstance(output, dict): - for k, v in output.items(): - record[('output', k)] = v - # Flatten 'evaluations' - for eval_name, eval_result in result['evaluations'].items(): + if '_str_value' in output and len(output) == 1: + record[('output', '')] = output['_str_value'] + column_tuples.add(('output', '')) + else: + for k, v in output.items(): + record[('output', k)] = v + column_tuples.add(('output', k)) + else: + record[('output', '')] = output + column_tuples.add(('output', '')) + + # Handle 'evaluations' fields + evaluations = result.get('evaluations', {}) + for eval_name, eval_result in evaluations.items(): if isinstance(eval_result, dict): for k, v in eval_result.items(): record[('evaluations', 
eval_name, k)] = v + column_tuples.add(('evaluations', eval_name, k)) else: record[('evaluations', eval_name)] = eval_result - # Flatten 'metadata' + column_tuples.add(('evaluations', eval_name)) + + # Handle 'metadata' fields for k, v in result.get('metadata', {}).items(): record[('metadata', k)] = v - # Include 'config' from the experiment + column_tuples.add(('metadata', k)) + + # Handle 'config' fields if self.experiment.config: for k, v in self.experiment.config.items(): record[('config', k)] = v - # Flatten 'error' - error = result['error'] + column_tuples.add(('config', k)) + + # Handle 'error' fields + error = result.get('error', {}) if error: - record[('error', 'message')] = error.get('message') - record[('error', 'type')] = error.get('type') - record[('error', 'stack')] = error.get('stack') - + for k, v in error.items(): + record[('error', k)] = v + column_tuples.add(('error', k)) + + data_rows.append(record) else: - # Keep nested structures - record['input'] = result['input'] - record['expected_output'] = result['expected_output'] - record['output'] = result.get('output') - record['evaluations'] = result.get('evaluations') - record['metadata'] = result.get('metadata') - record['config'] = self.experiment.config - record['error'] = result.get('error') - data.append(record) + # Non-multiindex implementation remains the same + new_record = {} + input_data = result.get('input', {}) + new_record['input'] = (input_data['_str_value'] + if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1 + else input_data) + expected_output = result.get('expected_output', {}) + new_record['expected_output'] = (expected_output['_str_value'] + if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1 + else expected_output) + output = result.get('output', {}) + new_record['output'] = (output['_str_value'] + if isinstance(output, dict) and '_str_value' in output and len(output) == 1 + else output) + new_record['evaluations'] = result.get('evaluations', {}) + new_record['metadata'] = result.get('metadata', {}) + new_record['config'] = self.experiment.config + new_record['error'] = result.get('error', {}) + data_rows.append(new_record) - - df = pd.DataFrame(data) if multiindex: - df.columns = pd.MultiIndex.from_tuples(df.columns) - return df + # Sort column_tuples based on the desired order + column_tuples = sorted(list(column_tuples), + key=lambda x: (COLUMN_ORDER.index(x[0]), x[1:] if len(x) > 1 else '')) + + # Build the DataFrame + records_list = [] + for record in data_rows: + row = [record.get(col, None) for col in column_tuples] + records_list.append(row) + + df = pd.DataFrame(records_list, columns=pd.MultiIndex.from_tuples(column_tuples)) + return df + else: + df = pd.DataFrame(data_rows) + # Reorder columns according to COLUMN_ORDER + cols = [col for col in COLUMN_ORDER if col in df.columns] + return df[cols] def push(self, overwrite: bool = False) -> None: """Push the experiment results to Datadog. @@ -1165,6 +1368,8 @@ def push(self, overwrite: bool = False) -> None: metrics.append(metric) + + # Prepare payload and send to Datadog results_payload = { "data": { @@ -1174,7 +1379,6 @@ def push(self, overwrite: bool = False) -> None: } } - print(json.dumps(results_payload, indent=2)) url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" exp_http_request("POST", url, body=json.dumps(results_payload).encode("utf-8")) @@ -1243,7 +1447,7 @@ def task(func): raise ValueError("Function name 'task' is reserved. 
Please use a different name for your task function.") @wraps(func) - def wrapper(input: Dict[str, Any], config: Optional[Dict[str, Any]] = None) -> Any: + def wrapper(input: Dict[str, Union[str, Dict[str, Any]]], config: Optional[Dict[str, Any]] = None) -> Any: # Call the original function with or without config if 'config' in inspect.signature(func).parameters: return func(input, config) @@ -1261,8 +1465,8 @@ def wrapper(input: Dict[str, Any], config: Optional[Dict[str, Any]] = None) -> A def evaluator(func): @wraps(func) - def wrapper(expected_output: Dict[str, Any], output: Any, input: Dict[str, Any] = None) -> Any: - return func(expected_output, output, input) + def wrapper(expected_output: Union[str, Dict[str, Any]], output: Union[str, Dict[str, Any]], input: Union[str, Dict[str, Any]] = None) -> Any: + return func(expected_output, output, input) # Enforce signature compliance sig = inspect.signature(func) params = sig.parameters From a228c30a61458d8654ac7c599cfae0d90c8ba540 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Fri, 6 Dec 2024 13:45:39 -0500 Subject: [PATCH 25/36] structure changes --- ddtrace/llmobs/__init__.py | 3 +- ddtrace/llmobs/_experiments.py | 819 ++++++++------------------------- 2 files changed, 194 insertions(+), 628 deletions(-) diff --git a/ddtrace/llmobs/__init__.py b/ddtrace/llmobs/__init__.py index bd382219754..549f83ad88f 100644 --- a/ddtrace/llmobs/__init__.py +++ b/ddtrace/llmobs/__init__.py @@ -12,8 +12,7 @@ from ._experiments import FileType from ._experiments import task from ._experiments import evaluator -from ._experiments import ExperimentGrid from ._llmobs import LLMObs -__all__ = ["LLMObs", "Dataset", "Experiment", "ExperimentResults", "FileType", "task", "evaluator", "ExperimentGrid"] +__all__ = ["LLMObs", "Dataset", "Experiment", "ExperimentResults", "FileType", "task", "evaluator"] diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 1b0a3987d06..b53f1c11708 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,5 +1,14 @@ -# TODO: Test failures on eval, how do we set errors +# TODO: Test failures on eval, how do we set errors, Report null when evaluator fails # TODO: Test workflows for re-evals and publishing results +# TODO: Test pushing experiments without data + +""" +Test coverage ideas: +- Define task and evaluator wrong +- Define experiment wrong +- Experiments with failures +- Eval failures +""" import concurrent.futures from datetime import datetime @@ -22,13 +31,11 @@ import ddtrace DD_SITE = os.getenv("DD_SITE", "datadoghq.com") -BASE_URL = f"https://api.{DD_SITE}" +BASE_URL = f"https://api.{DD_SITE}" #TODO: Change to https://api.{DD_SITE} when testing is complete in staging class FileType(Enum): CSV = 'csv' - PARQUET = 'parquet' - JSONL = 'jsonl' class Dataset: @@ -65,26 +72,15 @@ def __len__(self) -> int: return len(self._data) def __getitem__(self, index: int) -> Dict[str, Union[str, Dict[str, Any]]]: - """Get a dataset record, converting _str_value dictionaries back to strings. + """Get a dataset record. Args: index: Index of the record to retrieve Returns: - Dict containing the record with any _str_value values converted to strings + Dict containing the record. 
""" record = self._data[index].copy() - - # Convert input if it has _str_value - if 'input' in record and isinstance(record['input'], dict): - if '_str_value' in record['input'] and len(record['input']) == 1: - record['input'] = record['input']['_str_value'] - - # Convert expected_output if it has _str_value - if 'expected_output' in record and isinstance(record['expected_output'], dict): - if '_str_value' in record['expected_output'] and len(record['expected_output']) == 1: - record['expected_output'] = record['expected_output']['_str_value'] - return record def _validate_data(self, data: List[Dict[str, Union[str, Dict[str, Any]]]]) -> None: @@ -111,28 +107,6 @@ def _validate_data(self, data: List[Dict[str, Union[str, Dict[str, Any]]]]) -> N if set(row.keys()) != first_row_keys: raise ValueError("All rows must have the same keys.") - # Validate input if present - if 'input' in row: - if isinstance(row['input'], str): - # Convert string to dict with _str_value key - row['input'] = {'_str_value': row['input']} - elif isinstance(row['input'], dict): - # Do nothing - pass - else: - raise ValueError("The 'input' field must be either a string or a dictionary") - - # Validate expected_output if present - if 'expected_output' in row: - if isinstance(row['expected_output'], str): - # Convert string to dict with _str_value key - row['expected_output'] = {'_str_value': row['expected_output']} - elif isinstance(row['expected_output'], dict): - # Do nothing - pass - else: - raise ValueError("The 'expected_output' field must be either a string or a dictionary") - @classmethod def pull(cls, name: str) -> "Dataset": """Create a dataset from a dataset hosted in Datadog. @@ -171,18 +145,6 @@ def pull(cls, name: str) -> "Dataset": attrs = record.get("attributes", {}) input_data = attrs.get("input") expected_output = attrs.get("expected_output") - - print(input_data, expected_output) - - # Handle input data format - if isinstance(input_data, str): - input_data = {'_str_value': input_data} - # For dictionaries, keep as-is (no conversion needed) - - # Handle expected output format - if isinstance(expected_output, str): - expected_output = {'_str_value': expected_output} - # For dictionaries, keep as-is (no conversion needed) class_records.append({ "input": input_data, @@ -254,7 +216,6 @@ def from_csv( delimiter: str = ",", input_columns: List[str] = None, expected_output_columns: List[str] = None, - metadata_columns: List[str] = None, ) -> "Dataset": """Create a Dataset from a CSV file. 
@@ -265,7 +226,6 @@ def from_csv( delimiter: CSV delimiter character, defaults to comma input_columns: List of column names to use as input data expected_output_columns: List of column names to use as expected output data - metadata_columns: Optional list of column names to include as metadata Returns: Dataset: A new Dataset instance containing the CSV data @@ -289,33 +249,30 @@ def from_csv( header_columns = reader.fieldnames missing_input_columns = [col for col in input_columns if col not in header_columns] missing_output_columns = [col for col in expected_output_columns if col not in header_columns] - missing_metadata_columns = [] - if metadata_columns: - missing_metadata_columns = [col for col in metadata_columns if col not in header_columns] if missing_input_columns: raise ValueError(f"Input columns not found in CSV header: {missing_input_columns}") if missing_output_columns: raise ValueError(f"Expected output columns not found in CSV header: {missing_output_columns}") - if missing_metadata_columns: - raise ValueError(f"Metadata columns not found in CSV header: {missing_metadata_columns}") + + # Get metadata columns (all columns not used for input or expected output) + metadata_columns = [col for col in header_columns if col not in input_columns and col not in expected_output_columns] for row in rows: - # If single column, use string value wrapped in dict + # Handle input data if len(input_columns) == 1: - input_data = {'_str_value': row[input_columns[0]]} + input_data = row[input_columns[0]] else: input_data = {col: row[col] for col in input_columns} - # If single column, use string value wrapped in dict + # Handle expected output data if len(expected_output_columns) == 1: - expected_output_data = {'_str_value': row[expected_output_columns[0]]} + expected_output_data = row[expected_output_columns[0]] else: expected_output_data = {col: row[col] for col in expected_output_columns} - metadata = {} - if metadata_columns: - metadata = {col: row[col] for col in metadata_columns} + # Handle metadata (all remaining columns) + metadata = {col: row[col] for col in metadata_columns} data.append({ 'input': input_data, @@ -333,134 +290,6 @@ def from_csv( return cls(name=name, data=data, description=description) - @classmethod - def _from_jsonl(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": - """Create a Dataset from a JSONL file. 
- - Args: - filepath: Path to the JSONL file - name: Name of the dataset - description: Optional description of the dataset - input_columns: List of column names to use as input data - expected_output_columns: List of column names to use as expected output data - metadata_columns: Optional list of column names to include as metadata - - Returns: - Dataset: A new Dataset instance containing the JSONL data - - Raises: - ValueError: If input_columns or expected_output_columns are not provided - Exception: If there are issues reading the JSONL file - """ - if input_columns is None or expected_output_columns is None: - raise ValueError("`input_columns` and `expected_output_columns` must be provided.") - - data = [] - try: - with open(filepath, mode='r', encoding='utf-8') as jsonlfile: - for line in jsonlfile: - row = json.loads(line.strip()) - - input_data = {col: row.get(col) for col in input_columns} - expected_output_data = {col: row.get(col) for col in expected_output_columns} - metadata = {} - if metadata_columns: - metadata = {col: row.get(col) for col in metadata_columns} - - data.append({ - 'input': input_data, - 'expected_output': expected_output_data, - **metadata, - }) - - if not data: - raise ValueError("JSONL file is empty.") - - except FileNotFoundError as e: - raise DatasetFileError(f"JSONL file not found: {filepath}") from e - except PermissionError as e: - raise DatasetFileError(f"Permission denied when reading JSONL file: {filepath}") from e - except json.JSONDecodeError as e: - raise DatasetFileError(f"Error parsing JSONL file: {e}") from e - except Exception as e: - raise DatasetFileError(f"Unexpected error reading JSONL file: {e}") from e - - return cls(name=name, data=data, description=description) - - @classmethod - def _from_parquet(cls, filepath: str, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None) -> "Dataset": - """Create a Dataset from a Parquet file. - - Args: - filepath: Path to the Parquet file - name: Name of the dataset - description: Optional description of the dataset - input_columns: List of column names to use as input data - expected_output_columns: List of column names to use as expected output data - metadata_columns: Optional list of column names to include as metadata - - Returns: - Dataset: A new Dataset instance containing the Parquet data - - Raises: - ImportError: If pandas is not installed - ValueError: If input_columns or expected_output_columns are not provided, - if the Parquet file is empty, or if specified columns are missing - Exception: If there are issues reading the Parquet file - """ - try: - import pandas as pd - except ImportError: - raise ImportError( - "pandas is required to read parquet files. 
" - "Please install pandas with: pip install pandas" - ) - - if input_columns is None or expected_output_columns is None: - raise ValueError("`input_columns` and `expected_output_columns` must be provided.") - - data = [] - try: - df = pd.read_parquet(filepath) - if df.empty: - raise ValueError("Parquet file is empty.") - - # Ensure that the specified columns are present - missing_input_columns = [col for col in input_columns if col not in df.columns] - missing_output_columns = [col for col in expected_output_columns if col not in df.columns] - missing_metadata_columns = [] - if metadata_columns: - missing_metadata_columns = [col for col in metadata_columns if col not in df.columns] - - if missing_input_columns: - raise ValueError(f"Input columns not found in DataFrame: {missing_input_columns}") - if missing_output_columns: - raise ValueError(f"Expected output columns not found in DataFrame: {missing_output_columns}") - if missing_metadata_columns: - raise ValueError(f"Metadata columns not found in DataFrame: {missing_metadata_columns}") - - for idx, row in df.iterrows(): - input_data = {col: row[col] for col in input_columns} - expected_output_data = {col: row[col] for col in expected_output_columns} - metadata = {} - if metadata_columns: - metadata = {col: row[col] for col in metadata_columns} - - data.append({ - 'input': input_data, - 'expected_output': expected_output_data, - **metadata, - }) - - except FileNotFoundError as e: - raise DatasetFileError(f"Parquet file not found: {filepath}") from e - except PermissionError as e: - raise DatasetFileError(f"Permission denied when reading Parquet file: {filepath}") from e - except Exception as e: - raise DatasetFileError(f"Error reading Parquet file: {e}") from e - - return cls(name=name, data=data, description=description) - @classmethod def load(cls, path: str, filetype: FileType, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None, delimiter: str = ",") -> "Dataset": """Import a dataset from a file. 
@@ -491,24 +320,6 @@ def load(cls, path: str, filetype: FileType, name: str, description: str = "", i expected_output_columns=expected_output_columns, metadata_columns=metadata_columns, ) - elif filetype == FileType.JSONL: - return cls._from_jsonl( - filepath=path, - name=name, - description=description, - input_columns=input_columns, - expected_output_columns=expected_output_columns, - metadata_columns=metadata_columns, - ) - elif filetype == FileType.PARQUET: - return cls._from_parquet( - filepath=path, - name=name, - description=description, - input_columns=input_columns, - expected_output_columns=expected_output_columns, - metadata_columns=metadata_columns, - ) else: raise ValueError(f"Unsupported file type: {filetype}") @@ -541,23 +352,23 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": # Handle 'input' fields input_data = record.get('input', {}) - if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1: - flat_record[('input', '')] = input_data['_str_value'] - column_tuples.add(('input', '')) - else: + if isinstance(input_data, dict): for k, v in input_data.items(): flat_record[('input', k)] = v column_tuples.add(('input', k)) + else: + flat_record[('input', '')] = input_data + column_tuples.add(('input', '')) # Handle 'expected_output' fields expected_output = record.get('expected_output', {}) - if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1: - flat_record[('expected_output', '')] = expected_output['_str_value'] - column_tuples.add(('expected_output', '')) - else: + if isinstance(expected_output, dict): for k, v in expected_output.items(): flat_record[('expected_output', k)] = v column_tuples.add(('expected_output', k)) + else: + flat_record[('expected_output', '')] = expected_output + column_tuples.add(('expected_output', '')) # Handle any other top-level fields for k, v in record.items(): @@ -580,18 +391,13 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": return df else: - # For non-multiindex, convert _str_value in the nested structures data = [] for record in self._data: new_record = {} input_data = record.get('input', {}) - new_record['input'] = (input_data['_str_value'] - if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1 - else input_data) + new_record['input'] = input_data expected_output = record.get('expected_output', {}) - new_record['expected_output'] = (expected_output['_str_value'] - if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1 - else expected_output) + new_record['expected_output'] = expected_output # Copy other fields for k, v in record.items(): if k not in ['input', 'expected_output']: @@ -676,179 +482,103 @@ def __init__( def run_task( self, _jobs: int = 10, - _timeout: Optional[float] = None, - _retries: int = 0, - _max_delay: float = 60.0, - raise_on_error: bool = False, + raise_errors: bool = False, ) -> None: """Execute the task function on the dataset and store the outputs. Args: _jobs: Number of concurrent jobs to run (between 1-20). Defaults to 10. - timeout: Maximum time in seconds to wait for each task execution. - If None, will wait indefinitely. Defaults to None. - retries: Number of retry attempts for failed tasks. Defaults to 0. - max_delay: Maximum delay in seconds between retries using exponential backoff. - Defaults to 60 seconds. - raise_on_error: If True, raises exceptions from failed tasks. 
If False, stores + raise_errors: If True, raises exceptions from failed tasks. If False, stores errors in the output. Defaults to False. Raises: - ValueError: If _jobs is not between 1 and 20, or if retries is negative. + ValueError: If _jobs is not between 1 and 20 """ if not 1 <= _jobs <= 20: raise ValueError("Number of jobs must be between 1 and 20") - if _retries < 0: - raise ValueError("Number of retries must be non-negative") + self.outputs = [] total_rows = len(self.dataset) completed = 0 + error_count = 0 + error_messages = [] def process_row(idx_row): idx, row = idx_row - attempt = 0 - delay = 1.0 # Initial delay in seconds - - while attempt <= _retries: - start_time = time.time() - try: - # Extract the input data and convert if it's a _str_value dict - input_data = row['input'] - if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1: - input_data = input_data['_str_value'] - - def execute_task(): - if getattr(self.task, '_accepts_config', False): - return self.task(input_data, self.config) - return self.task(input_data) - - # Use ThreadPoolExecutor to enforce timeout - with concurrent.futures.ThreadPoolExecutor(max_workers=1) as single_executor: - future = single_executor.submit(execute_task) - output = future.result(timeout=_timeout) - - # Ensure output is a dictionary with _str_value for strings - if isinstance(output, str): - output = {'_str_value': output} - elif not isinstance(output, dict): - output = {'value': output} - - # Prepare output data - output_data = { - "idx": idx, - "output": output, - "metadata": { - "timestamp": start_time, - "duration": time.time() - start_time, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "error": { - "message": None, - "stack": None, - "type": None, - } + start_time = time.time() + try: + input_data = row['input'] + + if getattr(self.task, '_accepts_config', False): + output = self.task(input_data, self.config) + else: + output = self.task(input_data) + + output_data = { + "idx": idx, + "output": output, + "metadata": { + "timestamp": start_time, + "duration": time.time() - start_time, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "error": { + "message": None, + "stack": None, + "type": None, } - return output_data - - except concurrent.futures.TimeoutError as e: - print(f"Timeout error: {e}") - if raise_on_error: - # Raise specific experiment task error - raise ExperimentTaskError(f"Task timed out after {_timeout} seconds", idx, e) - if attempt < _retries: - # Exponential backoff and retry - sleep_time = min(delay, _max_delay) - time.sleep(sleep_time) - delay *= 2 - attempt += 1 - else: - # All retries exhausted, record the timeout error - output_data = { - "idx": idx, - "output": None, - "metadata": { - "timestamp": start_time, - "duration": time.time() - start_time, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "error": { - "message": f"Task timed out after {_timeout} seconds", - "stack": None, - "type": "TimeoutError", - } - } - return output_data - - except Exception as e: - print(f"Error: {e}") - if raise_on_error: - # Raise specific experiment task error - raise ExperimentTaskError(str(e), idx, e) - if attempt < _retries: - # Exponential backoff and retry - sleep_time = min(delay, _max_delay) - time.sleep(sleep_time) 
- delay *= 2 - attempt += 1 - else: - # All retries exhausted, record the error - output_data = { - "idx": idx, - "output": None, - "metadata": { - "timestamp": start_time, - "duration": time.time() - start_time, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "error": { - "message": str(e), - "stack": None, - "type": type(e).__name__, - } - } - return output_data + } + return output_data - # Initialize the progress bar - _print_progress_bar(0, total_rows, prefix='Processing:', suffix='Complete') + except Exception as e: + error_message = str(e) + error_messages.append(f"Row {idx}: {error_message}") + return { + "idx": idx, + "output": None, + "metadata": { + "timestamp": start_time, + "duration": time.time() - start_time, + "dataset_record_idx": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + }, + "error": { + "message": error_message, + "stack": None, + "type": type(e).__name__, + } + } - # Use a flag to determine if an error occurred - error_occurred = False - error_exception = None + _print_progress_bar(0, total_rows, prefix='Processing:', suffix='Complete') with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: - # Submit the process_row function to the executor for each dataset record futures = {executor.submit(process_row, (idx, row)): idx for idx, row in enumerate(self.dataset)} - outputs_buffer = [None] * total_rows + try: for future in concurrent.futures.as_completed(futures): idx = futures[future] - start_time = time.time() try: output_data = future.result() outputs_buffer[idx] = output_data - if raise_on_error and output_data['error']['message']: - # An error occurred; cancel all futures - error_occurred = True - error_exception = Exception(f"Task failed on row {idx}: {output_data['error']['message']}") - break + if raise_errors and output_data['error']['message']: + error_message = output_data['error']['message'] + raise ExperimentTaskError(error_message, idx, output_data['error']['type']) + elif output_data['error']['message']: + error_count += 1 + except Exception as e: - print(f"Error: {e}") outputs_buffer[idx] = { "idx": idx, "output": None, "metadata": { - "timestamp": start_time, - "duration": time.time() - start_time, + "timestamp": time.time(), + "duration": 0, "dataset_record_idx": idx, "project_name": self.project_name, "experiment_name": self.name, @@ -860,38 +590,39 @@ def execute_task(): "type": type(e).__name__, } } - if raise_on_error: - # An exception occurred; cancel all futures - error_occurred = True - error_exception = e - break + if raise_errors: + raise e + else: + error_count += 1 + error_messages.append(f"Row {idx}: {str(e)}") + completed += 1 _print_progress_bar(completed, total_rows, prefix='Processing:', suffix='Complete') - finally: - if error_occurred: - print(f"Error occurred: {error_exception}") - # Cancel all pending futures - for future in futures: - future.cancel() - # Shutdown the executor immediately - executor.shutdown(wait=False) - raise error_exception + + except Exception as e: + for future in futures: + future.cancel() + executor.shutdown(wait=False) + raise e self.outputs = outputs_buffer self.has_run = True - # Log error statistics if any errors occurred - error_count = sum(1 for output in self.outputs if output['error']['message'] is not None) + error_rate = (error_count / total_rows) * 100 + print(f"\nTask completed with {error_count} errors 
({error_rate:.2f}% error rate)") + if error_count > 0: - error_rate = (error_count / total_rows) * 100 - print(f"Task completed with {error_count} errors ({error_rate:.2f}% error rate)") + print("\nError Summary:") + for error_msg in error_messages: + print(f"- {error_msg}") + print("\nIf you'd like to halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.\n") - def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_on_error: bool = False) -> "ExperimentResults": + def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_errors: bool = False) -> "ExperimentResults": """Run evaluators on the outputs and return ExperimentResults. Args: evaluators (Optional[List[Callable]]): List of evaluators to use. If None, uses the experiment's evaluators. - raise_on_error (bool): If True, raises exceptions encountered during evaluation. + raise_errors (bool): If True, raises exceptions encountered during evaluation. Returns: ExperimentResults: A new ExperimentResults instance with the evaluation results. @@ -913,40 +644,31 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_on_ evaluations = [] total_rows = len(self.outputs) completed = 0 + error_count = 0 + error_messages = [] _print_progress_bar(0, total_rows, prefix='Evaluating:', suffix='Complete') for idx, output_data in enumerate(self.outputs): try: output = output_data["output"] - # Convert output if it has '_str_value' - if isinstance(output, dict) and '_str_value' in output and len(output) == 1: - output = output['_str_value'] - - # Get the corresponding dataset row + dataset_row = self.dataset[idx] input_data = dataset_row.get('input', {}) expected_output = dataset_row.get('expected_output', {}) - - # Convert input_data if it has '_str_value' - if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1: - input_data = input_data['_str_value'] - # Convert expected_output if it has '_str_value' - if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1: - expected_output = expected_output['_str_value'] - - # Perform evaluation evaluations_dict = {} for evaluator in evaluators_to_use: try: evaluation_result = evaluator(expected_output, output, input_data) evaluations_dict[evaluator.__name__] = evaluation_result except Exception as e: - print(f"Error evaluating row {idx}: {type(e).__name__}: {e}, with evaluator {evaluator.__name__}") - raise e + error_count += 1 + error_message = f"Row {idx}, Evaluator {evaluator.__name__}: {type(e).__name__}: {e}" + error_messages.append(error_message) + if raise_errors: + raise e - # Store evaluation results evaluations.append({ "idx": idx, "evaluations": evaluations_dict, @@ -954,8 +676,11 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_on_ }) except Exception as e: - if raise_on_error: + if raise_errors: raise e + error_count += 1 + error_message = f"Row {idx}: {type(e).__name__}: {e}" + error_messages.append(error_message) evaluations.append({ "idx": idx, "evaluations": {}, @@ -969,33 +694,38 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_on_ completed += 1 _print_progress_bar(completed, total_rows, prefix='Evaluating:', suffix='Complete') - # Return new ExperimentResults without modifying the experiment's state + error_rate = (error_count / (total_rows * len(evaluators_to_use))) * 100 + print(f"\nEvaluation completed with {error_count} errors 
({error_rate:.2f}% error rate)") + + if error_count > 0: + + print("\nError Summary:") + for error_msg in error_messages: + print(f"- {error_msg}") + print("\nIf you'd like to halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.\n") + + self.has_evaluated = True return ExperimentResults(self.dataset, self, self.outputs, evaluations) def run( self, _jobs: int = 10, - timeout: Optional[float] = None, - retries: int = 0, - max_delay: float = 60.0, - raise_on_error: bool = False, + raise_errors: bool = False, ) -> "ExperimentResults": """Execute the task and evaluations, returning the results. Args: _jobs (int): Number of worker threads. timeout (float, optional): Time limit for the task execution in seconds. - retries (int): Number of retries for failed tasks. - max_delay (float): Maximum delay between retries in seconds. - raise_on_error (bool): If True, raises exceptions from failed tasks. If False, stores - errors in the output. Defaults to False. + raise_errors (bool): If True, raises exceptions from failed tasks. If False, stores + errors in the output. Defaults to False. Returns: ExperimentResults: The results of the experiment. """ - self.run_task(_jobs=_jobs, _timeout=timeout, _retries=retries, _max_delay=max_delay, raise_on_error=raise_on_error) - experiment_results = self.run_evaluations(raise_on_error=raise_on_error) - print() # Move to the next line after completion + self.run_task(_jobs=_jobs, raise_errors=raise_errors) + experiment_results = self.run_evaluations(raise_errors=raise_errors) + print() return experiment_results @@ -1047,31 +777,15 @@ def __len__(self) -> int: return len(self.merged_results) def __getitem__(self, index: int) -> Any: - """Get a result record, converting _str_value dictionaries back to strings. + """Get a result record. Args: index: Index of the record to retrieve Returns: - Dict containing the record with any _str_value values converted to strings + Dict containing the record. 
""" result = self.merged_results[index].copy() - - # Convert input if it has _str_value - if 'input' in result and isinstance(result['input'], dict): - if '_str_value' in result['input'] and len(result['input']) == 1: - result['input'] = result['input']['_str_value'] - - # Convert expected_output if it has _str_value - if 'expected_output' in result and isinstance(result['expected_output'], dict): - if '_str_value' in result['expected_output'] and len(result['expected_output']) == 1: - result['expected_output'] = result['expected_output']['_str_value'] - - # Convert output if it has _str_value - if 'output' in result and isinstance(result['output'], dict): - if '_str_value' in result['output'] and len(result['output']) == 1: - result['output'] = result['output']['_str_value'] - return result def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": @@ -1105,36 +819,29 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": record = {} if multiindex: - # Handle 'input' fields input_data = result.get('input', {}) - if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1: - record[('input', '')] = input_data['_str_value'] - column_tuples.add(('input', '')) - else: + if isinstance(input_data, dict): for k, v in input_data.items(): record[('input', k)] = v column_tuples.add(('input', k)) + else: + record[('input', '')] = input_data + column_tuples.add(('input', '')) - # Handle 'expected_output' fields expected_output = result.get('expected_output', {}) - if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1: - record[('expected_output', '')] = expected_output['_str_value'] - column_tuples.add(('expected_output', '')) - else: + if isinstance(expected_output, dict): for k, v in expected_output.items(): record[('expected_output', k)] = v column_tuples.add(('expected_output', k)) + else: + record[('expected_output', '')] = expected_output + column_tuples.add(('expected_output', '')) - # Handle 'output' fields output = result.get('output', {}) if isinstance(output, dict): - if '_str_value' in output and len(output) == 1: - record[('output', '')] = output['_str_value'] - column_tuples.add(('output', '')) - else: - for k, v in output.items(): - record[('output', k)] = v - column_tuples.add(('output', k)) + for k, v in output.items(): + record[('output', k)] = v + column_tuples.add(('output', k)) else: record[('output', '')] = output column_tuples.add(('output', '')) @@ -1173,17 +880,11 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": # Non-multiindex implementation remains the same new_record = {} input_data = result.get('input', {}) - new_record['input'] = (input_data['_str_value'] - if isinstance(input_data, dict) and '_str_value' in input_data and len(input_data) == 1 - else input_data) + new_record['input'] = input_data expected_output = result.get('expected_output', {}) - new_record['expected_output'] = (expected_output['_str_value'] - if isinstance(expected_output, dict) and '_str_value' in expected_output and len(expected_output) == 1 - else expected_output) + new_record['expected_output'] = expected_output output = result.get('output', {}) - new_record['output'] = (output['_str_value'] - if isinstance(output, dict) and '_str_value' in output and len(output) == 1 - else output) + new_record['output'] = output new_record['evaluations'] = result.get('evaluations', {}) new_record['metadata'] = result.get('metadata', {}) new_record['config'] = self.experiment.config @@ 
-1209,7 +910,7 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": cols = [col for col in COLUMN_ORDER if col in df.columns] return df[cols] - def push(self, overwrite: bool = False) -> None: + def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite """Push the experiment results to Datadog. Raises: @@ -1248,61 +949,30 @@ def push(self, overwrite: bool = False) -> None: else: project_id = projects[0]["id"] - # Check if experiment exists - encoded_name = quote(self.experiment.name) - url = f"/api/unstable/llm-obs/v1/experiments?filter[name]={encoded_name}" - resp = exp_http_request("GET", url) - response_data = resp.json() - experiments = response_data.get("data", []) - - if not experiments: - # Create new experiment - experiment_payload = { - "data": { - "type": "experiments", - "attributes": { - "name": self.experiment.name, - "description": self.experiment.description, - "dataset_id": self.experiment.dataset._datadog_dataset_id, - "project_id": project_id, - "metadata": { - "tags": self.experiment.tags, - **self.experiment.metadata, - "config": self.experiment.config, - }, - }, - } - } - resp = exp_http_request( - "POST", "/api/unstable/llm-obs/v1/experiments", body=json.dumps(experiment_payload).encode("utf-8") - ) - response_data = resp.json() - experiment_id = response_data["data"]["id"] - else: - # Experiment exists, create a new version - version_suffix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - new_experiment_name = f"{self.experiment.name}-{version_suffix}" - experiment_payload = { - "data": { - "type": "experiments", - "attributes": { - "name": new_experiment_name, - "description": self.experiment.description, - "dataset_id": self.experiment.dataset._datadog_dataset_id, - "project_id": project_id, - "metadata": { - **self.experiment.metadata, - "config": self.experiment.config, - }, + # Create new experiment + experiment_payload = { + "data": { + "type": "experiments", + "attributes": { + "name": self.experiment.name, + "description": self.experiment.description, + "dataset_id": self.experiment.dataset._datadog_dataset_id, + "project_id": project_id, + "metadata": { + "tags": self.experiment.tags, + **(self.experiment.metadata or {}), + "config": self.experiment.config, }, - } + "ensure_unique": True, # Generates a new experiment with a unique name if the experiment name already exists + }, } - resp = exp_http_request( - "POST", "/api/unstable/llm-obs/v1/experiments", body=json.dumps(experiment_payload).encode("utf-8") - ) - response_data = resp.json() - experiment_id = response_data["data"]["id"] - self.experiment.name = new_experiment_name + } + resp = exp_http_request( + "POST", "/api/unstable/llm-obs/v1/experiments", body=json.dumps(experiment_payload).encode("utf-8") + ) + response_data = resp.json() + experiment_id = response_data["data"]["id"] + self.experiment.name = response_data["data"]["attributes"]["name"] spans = [] metrics = [] @@ -1431,7 +1101,10 @@ def exp_http_request(method: str, url: str, body: Optional[bytes] = None) -> HTT full_url = BASE_URL + url resp = http_request(method, full_url, headers=headers, body=body) if resp.status_code == 403: - raise ValueError("API key or application key is incorrect.") + if DD_SITE != "datadoghq.com": + raise ValueError("DD_SITE may be incorrect. 
Please check your DD_SITE environment variable.") + else: + raise ValueError("API key or application key is incorrect.") if resp.status_code >= 400: try: error_details = resp.json() @@ -1470,7 +1143,7 @@ def wrapper(expected_output: Union[str, Dict[str, Any]], output: Union[str, Dict # Enforce signature compliance sig = inspect.signature(func) params = sig.parameters - required_params = ['expected_output', 'output', 'input'] + required_params = ['input', 'output', 'expected_output'] if not all(param in params for param in required_params): raise TypeError(f"Evaluator function must have parameters {required_params}.") wrapper._is_evaluator = True # Set attribute to indicate decoration @@ -1481,117 +1154,10 @@ def _print_progress_bar(iteration, total, prefix='', suffix='', decimals=1, leng percent = f"{100 * (iteration / float(total)):.{decimals}f}" filled_length = int(length * iteration // total) bar = fill * filled_length + '-' * (length - filled_length) - print(f'\r{prefix} |{bar}| {percent}% {suffix}', end='\r') + # Use carriage return '\r' to overwrite the line + print(f'\r{prefix} |{bar}| {percent}% {suffix}', end='\r', flush=True) if iteration == total: - print() - - -class ExperimentGrid: - """Class to run a grid of experiments over multiple parameter combinations. - - Attributes: - name (str): Name of the experiment grid. - task (Callable): The task function to execute. - dataset (Dataset): The dataset to use. - evaluators (List[Callable]): List of evaluator functions. - config (Dict[str, List[Any]]): Parameter grid to run over. - tags (List[str]): List of tags. - project_name (str): Name of the project. - description (str): Description of the experiment grid. - metadata (Dict[str, Any]): Metadata dictionary. - experiments (List[Experiment]): List of experiments created. - results (List[ExperimentResults]): List of corresponding results. 
- """ - - def __init__( - self, - name: str, - task: Callable, - dataset: Dataset, - evaluators: List[Callable], - config: Dict[str, List[Any]], - tags: List[str] = [], - project_name: str = "-", - description: str = "", - metadata: Dict[str, Any] = {}, - ) -> None: - self.name = name - self.task = task - self.dataset = dataset - self.evaluators = evaluators - self.config = config - self.tags = tags - self.project_name = project_name - self.description = description - self.metadata = metadata - self.experiments = [] - self.results = [] - - # Generate all parameter combinations and create experiments - self._generate_experiments() - - def _generate_experiments(self): - keys, values = zip(*self.config.items()) - param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)] - - for params in param_combinations: - # Create config for the experiment - config = params.copy() - - # Compute hash of the config - config_str = json.dumps(config, sort_keys=True) - config_hash = hashlib.md5(config_str.encode('utf-8')).hexdigest() - config_hash_tag = f"config_hash:{config_hash}" - - # Generate a unique name for each experiment - experiment_name = f"{self.name}_" + "_".join(f"{k}_{v}" for k, v in params.items()) - - # Create tags for parameters - param_tags = [f"{k}:{v}" for k, v in params.items()] + [config_hash_tag] - - # Create a new experiment instance with updated config and name - experiment = Experiment( - name=experiment_name, - task=self.task, - dataset=self.dataset, - evaluators=self.evaluators, - tags=self.tags + param_tags, - project_name=self.project_name, - description=self.description, - metadata={**self.metadata, "config": config}, - config=config, - ) - - # Add the experiment to the list without running it - self.experiments.append(experiment) - - def __len__(self): - return len(self.experiments) - - def __getitem__(self, index): - return self.experiments[index] - - # Update the run method to use the pre-generated experiments - def run(self, _jobs: int = 10): - """Run experiments for all combinations of parameters in the grid. - - Args: - _jobs (int): Number of parallel workers for each experiment run. - """ - for experiment in self.experiments: - results = experiment.run(_jobs=_jobs) - self.results.append(results) - - return self.results - - def get_all_results(self) -> List[ExperimentResults]: - """Return all results from the experiment grid. - - Returns: - List[ExperimentResults]: A list of results for each experiment. 
- """ - return self.results - + print() # Move to the next line after completion class DatasetFileError(Exception): """Exception raised when there are errors reading or processing dataset files.""" @@ -1603,4 +1169,5 @@ class ExperimentTaskError(Exception): def __init__(self, message: str, row_idx: int, original_error: Exception = None): self.row_idx = row_idx self.original_error = original_error - super().__init__(f"Task failed on row {row_idx}: {message}") + super().__init__(message) + From b29fa1def19eef78b38282ffe73154bab11dcca6 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Mon, 9 Dec 2024 15:47:59 -0500 Subject: [PATCH 26/36] modifications to types --- ddtrace/llmobs/__init__.py | 4 +--- ddtrace/llmobs/_experiments.py | 42 +++++++++++++++++----------------- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/ddtrace/llmobs/__init__.py b/ddtrace/llmobs/__init__.py index 549f83ad88f..f2e91e8a1ca 100644 --- a/ddtrace/llmobs/__init__.py +++ b/ddtrace/llmobs/__init__.py @@ -8,11 +8,9 @@ from ._experiments import Dataset from ._experiments import Experiment -from ._experiments import ExperimentResults -from ._experiments import FileType from ._experiments import task from ._experiments import evaluator from ._llmobs import LLMObs -__all__ = ["LLMObs", "Dataset", "Experiment", "ExperimentResults", "FileType", "task", "evaluator"] +__all__ = ["LLMObs", "Dataset", "Experiment", "task", "evaluator"] diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index b53f1c11708..0af191bb9bb 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,7 +1,3 @@ -# TODO: Test failures on eval, how do we set errors, Report null when evaluator fails -# TODO: Test workflows for re-evals and publishing results -# TODO: Test pushing experiments without data - """ Test coverage ideas: - Define task and evaluator wrong @@ -10,20 +6,18 @@ - Eval failures """ +import csv import concurrent.futures -from datetime import datetime +from enum import Enum +from functools import wraps +import hashlib +import inspect import json import os import time from typing import Any, Callable, Dict, Iterator, List, Optional, Union -import inspect -from functools import wraps from urllib.parse import quote import uuid -import csv -from enum import Enum -import itertools -import hashlib from ._utils import HTTPResponse from ._utils import http_request @@ -31,8 +25,10 @@ import ddtrace DD_SITE = os.getenv("DD_SITE", "datadoghq.com") -BASE_URL = f"https://api.{DD_SITE}" #TODO: Change to https://api.{DD_SITE} when testing is complete in staging - +if DD_SITE == "datadoghq.com": + BASE_URL = f"https://api.{DD_SITE}" +else: + BASE_URL = f"https://{DD_SITE}" class FileType(Enum): CSV = 'csv' @@ -660,7 +656,7 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err evaluations_dict = {} for evaluator in evaluators_to_use: try: - evaluation_result = evaluator(expected_output, output, input_data) + evaluation_result = evaluator(input_data, output, expected_output) evaluations_dict[evaluator.__name__] = evaluation_result except Exception as e: error_count += 1 @@ -694,7 +690,11 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err completed += 1 _print_progress_bar(completed, total_rows, prefix='Evaluating:', suffix='Complete') - error_rate = (error_count / (total_rows * len(evaluators_to_use))) * 100 + if len(evaluators_to_use) > 0: + error_rate = (error_count / (total_rows * len(evaluators_to_use))) * 100 + else: + 
error_rate = 0 + print(f"\nEvaluation completed with {error_count} errors ({error_rate:.2f}% error rate)") if error_count > 0: @@ -1101,10 +1101,11 @@ def exp_http_request(method: str, url: str, body: Optional[bytes] = None) -> HTT full_url = BASE_URL + url resp = http_request(method, full_url, headers=headers, body=body) if resp.status_code == 403: - if DD_SITE != "datadoghq.com": + if not DD_SITE: raise ValueError("DD_SITE may be incorrect. Please check your DD_SITE environment variable.") else: - raise ValueError("API key or application key is incorrect.") + print(resp.text()) + raise ValueError("DD_API_KEY or DD_APPLICATION_KEY is incorrect.") if resp.status_code >= 400: try: error_details = resp.json() @@ -1138,8 +1139,8 @@ def wrapper(input: Dict[str, Union[str, Dict[str, Any]]], config: Optional[Dict[ def evaluator(func): @wraps(func) - def wrapper(expected_output: Union[str, Dict[str, Any]], output: Union[str, Dict[str, Any]], input: Union[str, Dict[str, Any]] = None) -> Any: - return func(expected_output, output, input) + def wrapper(input: Union[str, Dict[str, Any]] = None, output: Union[str, Dict[str, Any]] = None, expected_output: Union[str, Dict[str, Any]] = None) -> Any: + return func(input, output, expected_output) # Enforce signature compliance sig = inspect.signature(func) params = sig.parameters @@ -1169,5 +1170,4 @@ class ExperimentTaskError(Exception): def __init__(self, message: str, row_idx: int, original_error: Exception = None): self.row_idx = row_idx self.original_error = original_error - super().__init__(message) - + super().__init__(message) \ No newline at end of file From 738cc07dd953a360f8086c485f7a859727e335f8 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Mon, 9 Dec 2024 15:49:13 -0500 Subject: [PATCH 27/36] remove unnecessary comments --- ddtrace/llmobs/_experiments.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 0af191bb9bb..311dbf9dd2e 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -1,11 +1,3 @@ -""" -Test coverage ideas: -- Define task and evaluator wrong -- Define experiment wrong -- Experiments with failures -- Eval failures -""" - import csv import concurrent.futures from enum import Enum From 105917276cd706f2eb86f608cdabca19ce8e869e Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Mon, 9 Dec 2024 15:52:25 -0500 Subject: [PATCH 28/36] fix code quality violations --- ddtrace/llmobs/_experiments.py | 41 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 311dbf9dd2e..323a20b3bf4 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -308,8 +308,8 @@ def load(cls, path: str, filetype: FileType, name: str, description: str = "", i expected_output_columns=expected_output_columns, metadata_columns=metadata_columns, ) - else: - raise ValueError(f"Unsupported file type: {filetype}") + + raise ValueError(f"Unsupported file type: {filetype}") def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": """Convert the dataset to a pandas DataFrame. 
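The evaluator wrapper above now forwards arguments as `(input, output, expected_output)` and enforces exactly those parameter names, while tasks receive `input` (plus `config` when they declare it). A small sketch of a compliant task/evaluator pair (function names and logic are illustrative only):

    from ddtrace.llmobs import task, evaluator

    @task
    def summarize(input):
        # Toy stand-in for a model call; real tasks can also accept a `config` argument.
        return str(input)[:20]

    @evaluator
    def length_under_limit(input, output, expected_output):
        # Parameter names must match the enforced (input, output, expected_output) signature.
        return len(output) <= 20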
@@ -378,20 +378,19 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": return df - else: - data = [] - for record in self._data: - new_record = {} - input_data = record.get('input', {}) - new_record['input'] = input_data - expected_output = record.get('expected_output', {}) - new_record['expected_output'] = expected_output - # Copy other fields - for k, v in record.items(): - if k not in ['input', 'expected_output']: - new_record[k] = v - data.append(new_record) - return pd.DataFrame(data) + data = [] + for record in self._data: + new_record = {} + input_data = record.get('input', {}) + new_record['input'] = input_data + expected_output = record.get('expected_output', {}) + new_record['expected_output'] = expected_output + # Copy other fields + for k, v in record.items(): + if k not in ['input', 'expected_output']: + new_record[k] = v + data.append(new_record) + return pd.DataFrame(data) def export_to_jsonl(self, file_path): """ @@ -896,11 +895,11 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": df = pd.DataFrame(records_list, columns=pd.MultiIndex.from_tuples(column_tuples)) return df - else: - df = pd.DataFrame(data_rows) - # Reorder columns according to COLUMN_ORDER - cols = [col for col in COLUMN_ORDER if col in df.columns] - return df[cols] + + df = pd.DataFrame(data_rows) + # Reorder columns according to COLUMN_ORDER + cols = [col for col in COLUMN_ORDER if col in df.columns] + return df[cols] def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite """Push the experiment results to Datadog. From 965bdcbdc9e7c2e19524663af14122a7aa8143aa Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Fri, 13 Dec 2024 11:09:58 -0500 Subject: [PATCH 29/36] add test comments --- ddtrace/llmobs/_experiments.py | 216 +++++++++++------------- tests/llmobs/test_llmobs_experiments.py | 9 + 2 files changed, 103 insertions(+), 122 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 323a20b3bf4..c576827b602 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -193,7 +193,7 @@ def push(self) -> None: data = resp.json() # Print url to the dataset in Datadog - print(f"Dataset '{self.name}' created: {BASE_URL}/llm/experiments/datasets/{dataset_id}") + print(f"Dataset '{self.name}' created: {BASE_URL}/llm/experiments/datasets/{dataset_id}\n\n") @classmethod def from_csv( @@ -488,7 +488,6 @@ def run_task( total_rows = len(self.dataset) completed = 0 error_count = 0 - error_messages = [] def process_row(idx_row): idx, row = idx_row @@ -522,7 +521,6 @@ def process_row(idx_row): except Exception as e: error_message = str(e) - error_messages.append(f"Row {idx}: {error_message}") return { "idx": idx, "output": None, @@ -581,7 +579,6 @@ def process_row(idx_row): raise e else: error_count += 1 - error_messages.append(f"Row {idx}: {str(e)}") completed += 1 _print_progress_bar(completed, total_rows, prefix='Processing:', suffix='Complete') @@ -596,13 +593,9 @@ def process_row(idx_row): self.has_run = True error_rate = (error_count / total_rows) * 100 - print(f"\nTask completed with {error_count} errors ({error_rate:.2f}% error rate)") - + print(f"Task completed with {error_count} errors ({error_rate:.2f}% error rate)") if error_count > 0: - print("\nError Summary:") - for error_msg in error_messages: - print(f"- {error_msg}") - print("\nIf you'd like to halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.\n") + print("If you'd like to 
halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.") def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_errors: bool = False) -> "ExperimentResults": """Run evaluators on the outputs and return ExperimentResults. @@ -632,7 +625,6 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err total_rows = len(self.outputs) completed = 0 error_count = 0 - error_messages = [] _print_progress_bar(0, total_rows, prefix='Evaluating:', suffix='Complete') @@ -651,8 +643,6 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err evaluations_dict[evaluator.__name__] = evaluation_result except Exception as e: error_count += 1 - error_message = f"Row {idx}, Evaluator {evaluator.__name__}: {type(e).__name__}: {e}" - error_messages.append(error_message) if raise_errors: raise e @@ -666,8 +656,7 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err if raise_errors: raise e error_count += 1 - error_message = f"Row {idx}: {type(e).__name__}: {e}" - error_messages.append(error_message) + evaluations.append({ "idx": idx, "evaluations": {}, @@ -686,14 +675,10 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err else: error_rate = 0 - print(f"\nEvaluation completed with {error_count} errors ({error_rate:.2f}% error rate)") + print(f"Evaluation completed with {error_count} errors ({error_rate:.2f}% error rate)") if error_count > 0: - - print("\nError Summary:") - for error_msg in error_messages: - print(f"- {error_msg}") - print("\nIf you'd like to halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.\n") + print("If you'd like to halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.") self.has_evaluated = True return ExperimentResults(self.dataset, self, self.outputs, evaluations) @@ -716,7 +701,6 @@ def run( """ self.run_task(_jobs=_jobs, raise_errors=raise_errors) experiment_results = self.run_evaluations(raise_errors=raise_errors) - print() return experiment_results @@ -803,103 +787,86 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": # Define the desired column order COLUMN_ORDER = ['input', 'expected_output', 'output', 'evaluations', 'metadata', 'config', 'error'] - data_rows = [] - column_tuples = set() - - for result in self.merged_results: - record = {} - - if multiindex: - input_data = result.get('input', {}) - if isinstance(input_data, dict): - for k, v in input_data.items(): - record[('input', k)] = v - column_tuples.add(('input', k)) - else: - record[('input', '')] = input_data - column_tuples.add(('input', '')) - - expected_output = result.get('expected_output', {}) - if isinstance(expected_output, dict): - for k, v in expected_output.items(): - record[('expected_output', k)] = v - column_tuples.add(('expected_output', k)) - else: - record[('expected_output', '')] = expected_output - column_tuples.add(('expected_output', '')) - - output = result.get('output', {}) - if isinstance(output, dict): - for k, v in output.items(): - record[('output', k)] = v - column_tuples.add(('output', k)) - else: - record[('output', '')] = output - column_tuples.add(('output', '')) - - # Handle 'evaluations' fields - evaluations = result.get('evaluations', {}) - for eval_name, eval_result in evaluations.items(): - if isinstance(eval_result, dict): - for k, v in eval_result.items(): - record[('evaluations', 
eval_name, k)] = v - column_tuples.add(('evaluations', eval_name, k)) - else: - record[('evaluations', eval_name)] = eval_result - column_tuples.add(('evaluations', eval_name)) - - # Handle 'metadata' fields - for k, v in result.get('metadata', {}).items(): - record[('metadata', k)] = v - column_tuples.add(('metadata', k)) - - # Handle 'config' fields - if self.experiment.config: - for k, v in self.experiment.config.items(): - record[('config', k)] = v - column_tuples.add(('config', k)) - - # Handle 'error' fields - error = result.get('error', {}) - if error: - for k, v in error.items(): - record[('error', k)] = v - column_tuples.add(('error', k)) - - data_rows.append(record) - else: - # Non-multiindex implementation remains the same - new_record = {} - input_data = result.get('input', {}) - new_record['input'] = input_data - expected_output = result.get('expected_output', {}) - new_record['expected_output'] = expected_output - output = result.get('output', {}) - new_record['output'] = output - new_record['evaluations'] = result.get('evaluations', {}) - new_record['metadata'] = result.get('metadata', {}) - new_record['config'] = self.experiment.config - new_record['error'] = result.get('error', {}) - data_rows.append(new_record) - - if multiindex: - # Sort column_tuples based on the desired order - column_tuples = sorted(list(column_tuples), - key=lambda x: (COLUMN_ORDER.index(x[0]), x[1:] if len(x) > 1 else '')) - - # Build the DataFrame - records_list = [] - for record in data_rows: - row = [record.get(col, None) for col in column_tuples] - records_list.append(row) - - df = pd.DataFrame(records_list, columns=pd.MultiIndex.from_tuples(column_tuples)) - return df - - df = pd.DataFrame(data_rows) - # Reorder columns according to COLUMN_ORDER - cols = [col for col in COLUMN_ORDER if col in df.columns] - return df[cols] + # Convert merged_results to DataFrame directly + df = pd.DataFrame(self.merged_results) + + if not multiindex: + # Reorder columns according to COLUMN_ORDER + cols = [col for col in COLUMN_ORDER if col in df.columns] + return df[cols] + + # For multiindex, we need to handle each column type differently + result_dfs = [] + + # Handle input column + input_df = pd.DataFrame({'input': df['input'].values}) + + # Handle expected_output column + expected_output_df = pd.DataFrame({'expected_output': df['expected_output'].values}) + + # Handle output column - expand the nested structure + output_df = pd.json_normalize( + df['output'].fillna({}).values, + sep='_' + ).add_prefix('output_') + + # Handle evaluations - flatten the dictionary + evaluations_df = pd.DataFrame(df['evaluations'].values.tolist()) + if not evaluations_df.empty: + evaluations_df = evaluations_df.astype(object) # Ensure columns are of object type + evaluations_df = evaluations_df.add_prefix('evaluations_') + # Replace NaN with None + evaluations_df = evaluations_df.where(pd.notna(evaluations_df), None) + + # Handle metadata - flatten the dictionary + metadata_df = pd.DataFrame(df['metadata'].values.tolist()) + if not metadata_df.empty: + metadata_df = metadata_df.add_prefix('metadata_') + + # Handle config if it exists + if 'config' in df.columns: + config_df = pd.json_normalize( + df['config'].fillna({}).values, + sep='_' + ).add_prefix('config_') + else: + config_df = pd.DataFrame() + + # Handle error column - flatten the dictionary and preserve None values + error_dicts = df['error'].values.tolist() + error_df = pd.DataFrame(error_dicts) + if not error_df.empty: + error_df = error_df.add_prefix('error_') + + # 
Combine all DataFrames + result_dfs = [ + input_df, + expected_output_df, + output_df, + evaluations_df, + metadata_df, + config_df, + error_df + ] + + # Filter out empty DataFrames and concatenate + result_dfs = [df for df in result_dfs if not df.empty] + final_df = pd.concat(result_dfs, axis=1) + + # Replace NaN with None + final_df = final_df.where(pd.notna(final_df), None) + + # Create MultiIndex columns + new_columns = pd.MultiIndex.from_tuples([ + tuple(col.split('_', 1)) if '_' in col else (col, '') + for col in final_df.columns + ]) + final_df.columns = new_columns + + # Replace NaN with None for the entire DataFrame + final_df = final_df.where(pd.notna(final_df), None) + + return final_df def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite """Push the experiment results to Datadog. @@ -1007,15 +974,21 @@ def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite # Add evaluation metrics for metric_name, metric_value in evaluations.items(): + # Skip None values + if metric_value is None: + print(f"Skipping None value for metric: {metric_name}") + continue + timestamp_ms = int(metadata.get("timestamp", time.time()) * 1000) # Check for bool first, since bool is a subclass of int - if isinstance(metric_value, bool): + if isinstance(metric_value, (bool, str)): metric_type = "categorical" metric_value = str(metric_value).lower() elif isinstance(metric_value, (int, float)): metric_type = "score" else: + print(f"Unknown metric type: {type(metric_value)}") metric_type = "categorical" metric_value = str(metric_value) @@ -1045,7 +1018,7 @@ def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite exp_http_request("POST", url, body=json.dumps(results_payload).encode("utf-8")) # Print URL to the experiment in Datadog - print(f"Experiment '{self.experiment.name}' created: {BASE_URL}/llm/experiments/experiment-list/{experiment_id}") + print(f"Experiment '{self.experiment.name}' created: {BASE_URL}/llm/experiments/experiment-list/{experiment_id} \n\n") def export_to_jsonl(self, file_path): """ @@ -1095,7 +1068,6 @@ def exp_http_request(method: str, url: str, body: Optional[bytes] = None) -> HTT if not DD_SITE: raise ValueError("DD_SITE may be incorrect. 
Please check your DD_SITE environment variable.") else: - print(resp.text()) raise ValueError("DD_API_KEY or DD_APPLICATION_KEY is incorrect.") if resp.status_code >= 400: try: diff --git a/tests/llmobs/test_llmobs_experiments.py b/tests/llmobs/test_llmobs_experiments.py index 7d3612e5bd8..5b4224af8c7 100644 --- a/tests/llmobs/test_llmobs_experiments.py +++ b/tests/llmobs/test_llmobs_experiments.py @@ -1,3 +1,12 @@ +""" +Test coverage ideas: +- Define task and evaluator wrong +- Define experiment wrong +- Experiments with failures +- Eval failures +- Test workflows for re-evals and publishing results +""" + import itertools import os from typing import Any From ba8e8070ac9f9a5fd3f6b5b516e7914e7207dc6a Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Mon, 13 Jan 2025 13:16:09 -0500 Subject: [PATCH 30/36] add error fields on evals --- ddtrace/llmobs/_experiments.py | 208 ++++++++++++++------------------- 1 file changed, 86 insertions(+), 122 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index c576827b602..72128e04fa8 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -7,6 +7,7 @@ import json import os import time +import traceback from typing import Any, Callable, Dict, Iterator, List, Optional, Union from urllib.parse import quote import uuid @@ -14,6 +15,8 @@ from ._utils import HTTPResponse from ._utils import http_request +from decorators import agent + import ddtrace DD_SITE = os.getenv("DD_SITE", "datadoghq.com") @@ -127,6 +130,9 @@ def pull(cls, name: str) -> "Dataset": resp = exp_http_request("GET", url) records_data = resp.json() + if not records_data.get("data", []): + raise ValueError(f"Dataset '{name}' does not contain any records.") + # Transform records into the expected format class_records = [] for record in records_data.get("data", []): @@ -534,7 +540,7 @@ def process_row(idx_row): }, "error": { "message": error_message, - "stack": None, + "stack": traceback.format_exc(), "type": type(e).__name__, } } @@ -571,7 +577,7 @@ def process_row(idx_row): }, "error": { "message": str(e), - "stack": None, + "stack": traceback.format_exc(), "type": type(e).__name__, } } @@ -624,53 +630,49 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err evaluations = [] total_rows = len(self.outputs) completed = 0 - error_count = 0 + error_count = 0 _print_progress_bar(0, total_rows, prefix='Evaluating:', suffix='Complete') for idx, output_data in enumerate(self.outputs): - try: - output = output_data["output"] - - dataset_row = self.dataset[idx] - input_data = dataset_row.get('input', {}) - expected_output = dataset_row.get('expected_output', {}) - - evaluations_dict = {} - for evaluator in evaluators_to_use: - try: - evaluation_result = evaluator(input_data, output, expected_output) - evaluations_dict[evaluator.__name__] = evaluation_result - except Exception as e: - error_count += 1 - if raise_errors: - raise e - - evaluations.append({ - "idx": idx, - "evaluations": evaluations_dict, - "error": None, - }) - - except Exception as e: - if raise_errors: - raise e - error_count += 1 + output = output_data["output"] + dataset_row = self.dataset[idx] + input_data = dataset_row.get('input', {}) + expected_output = dataset_row.get('expected_output', {}) - evaluations.append({ - "idx": idx, - "evaluations": {}, - "error": { - "message": str(e), - "type": type(e).__name__, - "stack": None, - }, - }) + evaluations_dict = {} + + # Run all evaluators for this output + for evaluator in 
evaluators_to_use: + try: + evaluation_result = evaluator(input_data, output, expected_output) + evaluations_dict[evaluator.__name__] = { + "value": evaluation_result, + "error": None + } + except Exception as e: + error_count += 1 + evaluations_dict[evaluator.__name__] = { + "value": None, + "error": { + "message": str(e), + "type": type(e).__name__, + "stack": traceback.format_exc(), + } + } + if raise_errors: + raise e + + # Add single evaluation entry for this output + evaluations.append({ + "idx": idx, + "evaluations": evaluations_dict + }) completed += 1 _print_progress_bar(completed, total_rows, prefix='Evaluating:', suffix='Complete') - if len(evaluators_to_use) > 0: + if len(evaluators_to_use) > 0: error_rate = (error_count / (total_rows * len(evaluators_to_use))) * 100 else: error_rate = 0 @@ -679,7 +681,7 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err if error_count > 0: print("If you'd like to halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.") - + self.has_evaluated = True return ExperimentResults(self.dataset, self, self.outputs, evaluations) @@ -732,15 +734,18 @@ def _merge_results(self) -> List[Dict[str, Any]]: evaluation_data = self.evaluations[idx] dataset_record = self.dataset._data[idx] + # Get base metadata and add tags to it + metadata = output_data.get('metadata', {}) + metadata['tags'] = self.experiment.tags + merged_result = { "idx": idx, "input": dataset_record.get('input', {}), "expected_output": dataset_record.get('expected_output', {}), "output": output_data.get('output'), "evaluations": evaluation_data.get('evaluations', {}), - "metadata": output_data.get('metadata', {}), + "metadata": metadata, "error": output_data.get('error'), - "tags": self.experiment.tags, } merged_results.append(merged_result) return merged_results @@ -764,14 +769,14 @@ def __getitem__(self, index: int) -> Any: return result def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": - """Convert the experiment results to a pandas DataFrame, including the experiment config. + """Convert the experiment results to a pandas DataFrame. Args: - multiindex (bool): If True, expand nested dictionaries into MultiIndex columns. + multiindex (bool): If True, expand input/output/expected_output dictionaries into MultiIndex columns. If False, keep the nested dictionaries as they are. Returns: - pd.DataFrame: A DataFrame representation of the experiment results. + pd.DataFrame: DataFrame representation of the experiment results. Raises: ImportError: If pandas is not installed. 
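After this change each evaluator result is recorded as a `{"value": ..., "error": ...}` pair and the experiment tags are folded into `metadata`, so consumers read per-evaluator values and failures from the merged row. A self-contained sketch of walking one row in that shape (the row literal below is hand-built for illustration):

    # A hand-built row mirroring the merged-result layout produced by _merge_results()
    # after this patch; all values are illustrative.
    row = {
        "idx": 0,
        "input": "What is 2+2?",
        "expected_output": "4",
        "output": "4",
        "evaluations": {
            "exact_match": {"value": True, "error": None},
            "flaky_check": {
                "value": None,
                "error": {"message": "boom", "type": "RuntimeError", "stack": "..."},
            },
        },
        "metadata": {"dataset_record_idx": 0, "tags": ["team:llm-obs"]},
        "error": {"message": None, "stack": None, "type": None},
    }

    for name, outcome in row["evaluations"].items():
        if outcome["error"] is not None:
            # Evaluator failures are captured per evaluator instead of aborting the row.
            print(f"{name} failed: {outcome['error']['type']}: {outcome['error']['message']}")
        else:
            print(f"{name} -> {outcome['value']}")

    print("tags recorded with the run:", row["metadata"]["tags"])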
@@ -784,88 +789,47 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": "Please install it with `pip install pandas`" ) - # Define the desired column order - COLUMN_ORDER = ['input', 'expected_output', 'output', 'evaluations', 'metadata', 'config', 'error'] - # Convert merged_results to DataFrame directly df = pd.DataFrame(self.merged_results) if not multiindex: - # Reorder columns according to COLUMN_ORDER - cols = [col for col in COLUMN_ORDER if col in df.columns] - return df[cols] + return df - # For multiindex, we need to handle each column type differently + # Process input, output, and expected_output with MultiIndex + special_fields = ['input', 'output', 'expected_output'] result_dfs = [] - # Handle input column - input_df = pd.DataFrame({'input': df['input'].values}) - - # Handle expected_output column - expected_output_df = pd.DataFrame({'expected_output': df['expected_output'].values}) - - # Handle output column - expand the nested structure - output_df = pd.json_normalize( - df['output'].fillna({}).values, - sep='_' - ).add_prefix('output_') - - # Handle evaluations - flatten the dictionary - evaluations_df = pd.DataFrame(df['evaluations'].values.tolist()) - if not evaluations_df.empty: - evaluations_df = evaluations_df.astype(object) # Ensure columns are of object type - evaluations_df = evaluations_df.add_prefix('evaluations_') - # Replace NaN with None - evaluations_df = evaluations_df.where(pd.notna(evaluations_df), None) - - # Handle metadata - flatten the dictionary - metadata_df = pd.DataFrame(df['metadata'].values.tolist()) - if not metadata_df.empty: - metadata_df = metadata_df.add_prefix('metadata_') - - # Handle config if it exists - if 'config' in df.columns: - config_df = pd.json_normalize( - df['config'].fillna({}).values, - sep='_' - ).add_prefix('config_') - else: - config_df = pd.DataFrame() + # Handle special fields (input, output, expected_output) + for field in special_fields: + if field not in df.columns: + continue + + # Get the first non-null value to check type + first_value = next((v for v in df[field] if v is not None), None) + + if isinstance(first_value, dict): + # For dictionary values, expand into columns + field_df = pd.json_normalize(df[field].values) + else: + # For simple values, use 'value' as the subcolumn + field_df = pd.DataFrame({'value': df[field].values}) + + # Create MultiIndex columns for this field + field_df.columns = pd.MultiIndex.from_tuples([(field, col) for col in field_df.columns]) + result_dfs.append(field_df) - # Handle error column - flatten the dictionary and preserve None values - error_dicts = df['error'].values.tolist() - error_df = pd.DataFrame(error_dicts) - if not error_df.empty: - error_df = error_df.add_prefix('error_') + # Add all other columns as-is + other_cols = [col for col in df.columns if col not in special_fields] + if other_cols: + other_df = df[other_cols] + result_dfs.append(other_df) # Combine all DataFrames - result_dfs = [ - input_df, - expected_output_df, - output_df, - evaluations_df, - metadata_df, - config_df, - error_df - ] - - # Filter out empty DataFrames and concatenate - result_dfs = [df for df in result_dfs if not df.empty] final_df = pd.concat(result_dfs, axis=1) # Replace NaN with None final_df = final_df.where(pd.notna(final_df), None) - # Create MultiIndex columns - new_columns = pd.MultiIndex.from_tuples([ - tuple(col.split('_', 1)) if '_' in col else (col, '') - for col in final_df.columns - ]) - final_df.columns = new_columns - - # Replace NaN with None for the entire 
DataFrame - final_df = final_df.where(pd.notna(final_df), None) - return final_df def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite @@ -973,31 +937,31 @@ def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite spans.append(span) # Add evaluation metrics - for metric_name, metric_value in evaluations.items(): + for metric_payload_name, metric_payload_value in evaluations.items(): # Skip None values - if metric_value is None: - print(f"Skipping None value for metric: {metric_name}") + if metric_payload_value is None: + print(f"Skipping None value for metric: {metric_payload_name}") continue timestamp_ms = int(metadata.get("timestamp", time.time()) * 1000) # Check for bool first, since bool is a subclass of int - if isinstance(metric_value, (bool, str)): + if isinstance(metric_payload_value["value"], (bool, str)): metric_type = "categorical" - metric_value = str(metric_value).lower() - elif isinstance(metric_value, (int, float)): + metric_value = str(metric_payload_value["value"]).lower() + elif isinstance(metric_payload_value["value"], (int, float)): metric_type = "score" else: - print(f"Unknown metric type: {type(metric_value)}") metric_type = "categorical" - metric_value = str(metric_value) + metric_value = str(metric_payload_value["value"]) metric = { "span_id": span["span_id"], "metric_type": metric_type, "timestamp_ms": timestamp_ms, - "label": metric_name, + "label": metric_payload_name, "score_value" if metric_type == "score" else "categorical_value": metric_value, + "error": metric_payload_value["error"], } metrics.append(metric) From 2a73462d4d880d9fd6d372af3f73d9bf534fec2f Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Wed, 15 Jan 2025 12:48:14 -0500 Subject: [PATCH 31/36] encode llm events in utf-8 --- ddtrace/llmobs/_writer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index 5880019d67f..e1dd9280ff7 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -210,6 +210,8 @@ def encode(self): data = {"_dd.stage": "raw", "_dd.tracer_version": ddtrace.__version__, "event_type": "span", "spans": events} try: enc_llm_events = safe_json(data) + if isinstance(enc_llm_events, str): + enc_llm_events = enc_llm_events.encode('utf-8') logger.debug("encode %d LLMObs span events to be sent", len(events)) except TypeError: logger.error("failed to encode %d LLMObs span events", len(events), exc_info=True) From 9497ea8717823b639ce338eda45075d90119005f Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Wed, 15 Jan 2025 12:51:01 -0500 Subject: [PATCH 32/36] tracing works --- ddtrace/llmobs/__init__.py | 2 +- ddtrace/llmobs/_experiments.py | 264 ++++++++++++++++++++------------- 2 files changed, 166 insertions(+), 100 deletions(-) diff --git a/ddtrace/llmobs/__init__.py b/ddtrace/llmobs/__init__.py index f2e91e8a1ca..26bc6f964b3 100644 --- a/ddtrace/llmobs/__init__.py +++ b/ddtrace/llmobs/__init__.py @@ -5,12 +5,12 @@ from ddtrace.llmobs import LLMObs LLMObs.enable() """ +from ._llmobs import LLMObs from ._experiments import Dataset from ._experiments import Experiment from ._experiments import task from ._experiments import evaluator -from ._llmobs import LLMObs __all__ = ["LLMObs", "Dataset", "Experiment", "task", "evaluator"] diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 72128e04fa8..b9dfc360e64 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -15,7 +15,10 @@ from ._utils import HTTPResponse 
from ._utils import http_request -from decorators import agent +from .decorators import agent +from ._llmobs import LLMObs + +from ddtrace.context import Context import ddtrace @@ -28,6 +31,14 @@ class FileType(Enum): CSV = 'csv' +LLMObs.enable( + ml_app="experiment-jonathan", + integrations_enabled=True, + agentless_enabled=True, + site="datadoghq.com", + api_key=os.getenv("DD_API_KEY"), +) + class Dataset: """A container for LLM experiment data that can be pushed to and retrieved from Datadog. @@ -40,21 +51,29 @@ class Dataset: description (str): Optional description of the dataset """ - def __init__(self, name: str, data: List[Dict[str, Union[str, Dict[str, Any]]]], description: str = "") -> None: + def __init__(self, name: str, data: Optional[List[Dict[str, Union[str, Dict[str, Any]]]]] = None, description: str = "") -> None: """ Args: name: Name of the dataset data: List of dictionaries where 'input' and 'expected_output' values can be - either strings or dictionaries of strings + either strings or dictionaries of strings. If None, attempts to pull from Datadog. description: Optional description of the dataset """ self.name = name self.description = description - self._validate_data(data) - self._data = data - # Post-push attributes - self._datadog_dataset_id = None + # If no data provided, attempt to pull from Datadog + if data is None: + print( + f"No data provided, pulling dataset '{name}' from Datadog..." + ) + pulled_dataset = self.pull(name) + self._data = pulled_dataset._data + self._datadog_dataset_id = pulled_dataset._datadog_dataset_id + else: + self._validate_data(data) + self._data = data + self._datadog_dataset_id = None def __iter__(self) -> Iterator[Dict[str, Union[str, Dict[str, Any]]]]: return iter(self._data) @@ -151,9 +170,12 @@ def pull(cls, name: str) -> "Dataset": dataset._datadog_dataset_id = dataset_id return dataset - def push(self) -> None: + def push(self, chunk_size: int = 300) -> None: """Push the dataset to Datadog. + Args: + chunk_size: Number of records to upload in each chunk. Defaults to 300. + Returns: Dict[str, Any]: Dictionary containing dataset information including: - dataset_id: The ID of the created/updated dataset @@ -192,14 +214,27 @@ def push(self) -> None: "Please use a different name for your dataset." 
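
Before the chunked upload code below, a minimal usage sketch of the constructor and push() changes in this hunk; the dataset name and rows are invented for illustration:

from ddtrace.llmobs import Dataset

rows = [
    {"input": {"question": "What is 2 + 2?"}, "expected_output": {"answer": "4"}},
    {"input": {"question": "Capital of France?"}, "expected_output": {"answer": "Paris"}},
]

ds = Dataset(name="toy-qa", data=rows, description="toy dataset")
ds.push(chunk_size=300)   # records are uploaded in slices of chunk_size

# With no data argument, the constructor now pulls an existing dataset of that name from Datadog.
same_ds = Dataset(name="toy-qa")
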
) - # Add records to the dataset - records_payload = {"data": {"type": "datasets", "attributes": {"records": self._data}}} - url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" - resp = exp_http_request("POST", url, body=json.dumps(records_payload).encode("utf-8")) - data = resp.json() + # Split records into chunks and upload + total_records = len(self._data) + chunks = [self._data[i:i + chunk_size] for i in range(0, total_records, chunk_size)] + total_chunks = len(chunks) + + # Only show progress bar for large datasets + show_progress = total_records > chunk_size + if show_progress: + print(f"\nUploading {total_records} records in {total_chunks} chunks...") + _print_progress_bar(0, total_chunks, prefix='Uploading:', suffix='Complete') + + for i, chunk in enumerate(chunks): + records_payload = {"data": {"type": "datasets", "attributes": {"records": chunk}}} + url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" + resp = exp_http_request("POST", url, body=json.dumps(records_payload).encode("utf-8")) + + if show_progress: + _print_progress_bar(i + 1, total_chunks, prefix='Uploading:', suffix='Complete') # Print url to the dataset in Datadog - print(f"Dataset '{self.name}' created: {BASE_URL}/llm/experiments/datasets/{dataset_id}\n\n") + print(f"\nDataset '{self.name}' created: {BASE_URL}/llm/experiments/datasets/{dataset_id}\n") @classmethod def from_csv( @@ -485,10 +520,14 @@ def run_task( errors in the output. Defaults to False. Raises: - ValueError: If _jobs is not between 1 and 20 + ValueError: If _jobs is not between 1 and 30 """ - if not 1 <= _jobs <= 20: - raise ValueError("Number of jobs must be between 1 and 20") + if not 1 <= _jobs <= 30: + raise ValueError("Number of jobs must be between 1 and 30") + + @agent + def instrumented_task(input_data, config=None): # To trace the task + return self.task(input_data, config) self.outputs = [] total_rows = len(self.dataset) @@ -498,14 +537,20 @@ def run_task( def process_row(idx_row): idx, row = idx_row start_time = time.time() + ddtrace.tracer.context_provider.activate(Context()) + try: input_data = row['input'] if getattr(self.task, '_accepts_config', False): - output = self.task(input_data, self.config) + output = instrumented_task(input_data, self.config) else: - output = self.task(input_data) - + output = instrumented_task(input_data) + + # Periodic flush every 10 rows (approximate because it's concurrent) + if idx % 10 == 0: + LLMObs.flush() + output_data = { "idx": idx, "output": output, @@ -560,6 +605,7 @@ def process_row(idx_row): if raise_errors and output_data['error']['message']: error_message = output_data['error']['message'] raise ExperimentTaskError(error_message, idx, output_data['error']['type']) + elif output_data['error']['message']: error_count += 1 @@ -597,6 +643,9 @@ def process_row(idx_row): self.outputs = outputs_buffer self.has_run = True + + # Final flush at the end + LLMObs.flush() error_rate = (error_count / total_rows) * 100 print(f"Task completed with {error_count} errors ({error_rate:.2f}% error rate)") @@ -832,9 +881,12 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": return final_df - def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite + def push(self, chunk_size: int = 300) -> None: """Push the experiment results to Datadog. + Args: + chunk_size: Number of records to upload in each chunk. Defaults to 300. 
+ Raises: ValueError: If the dataset hasn't been pushed to Datadog first """ @@ -896,93 +948,107 @@ def push(self, overwrite: bool = False) -> None: # TODO: Implement overwrite experiment_id = response_data["data"]["id"] self.experiment.name = response_data["data"]["attributes"]["name"] - spans = [] - metrics = [] - for result in self.merged_results: - idx = result['idx'] - merged_result = result - output = merged_result.get('output') - input = merged_result.get('input', {}) - evaluations = merged_result.get('evaluations', {}) - expected_output = merged_result.get('expected_output', {}) - metadata = merged_result.get('metadata', {}) - error = merged_result.get('error', {}) - - # When the dataset is not hosted, we use the hash of the input and expected output as the dataset record id - dataset_record_id = hashlib.md5((str(input) + str(expected_output)).encode('utf-8')).hexdigest() - - span = { - "span_id": _make_id(), - "project_id": project_id, - "experiment_id": experiment_id, - "dataset_id": self.experiment.dataset._datadog_dataset_id, - #TODO: Extract the record id from the dataset for hosted datasets - "dataset_record_id": dataset_record_id, - "start_ns": int(metadata.get("timestamp", time.time()) * 1e9), - "duration": float(metadata.get("duration", 0) * 1e9), - "status": "ok" if not error else "error", - "metrics": {}, # TODO: Fill in with actual metrics once we have tracing and llm spans - "meta": { - "span": {"kind": "experiment"}, - "input": merged_result.get('input', {}), - "output": output, - "expected_output": merged_result.get('expected_output', {}), - "error": { - "message": error.get("message"), - "type": error.get("type"), - "stack": error.get("stack"), - } - }, - } - spans.append(span) - - # Add evaluation metrics - for metric_payload_name, metric_payload_value in evaluations.items(): - # Skip None values - if metric_payload_value is None: - print(f"Skipping None value for metric: {metric_payload_name}") - continue - - timestamp_ms = int(metadata.get("timestamp", time.time()) * 1000) - - # Check for bool first, since bool is a subclass of int - if isinstance(metric_payload_value["value"], (bool, str)): - metric_type = "categorical" - metric_value = str(metric_payload_value["value"]).lower() - elif isinstance(metric_payload_value["value"], (int, float)): - metric_type = "score" - else: - metric_type = "categorical" - metric_value = str(metric_payload_value["value"]) - - metric = { - "span_id": span["span_id"], - "metric_type": metric_type, - "timestamp_ms": timestamp_ms, - "label": metric_payload_name, - "score_value" if metric_type == "score" else "categorical_value": metric_value, - "error": metric_payload_value["error"], - } + # Process results in chunks + total_results = len(self.merged_results) + chunks = [self.merged_results[i:i + chunk_size] for i in range(0, total_results, chunk_size)] + total_chunks = len(chunks) - metrics.append(metric) + # Only show progress bar for large result sets + show_progress = total_results > chunk_size + if show_progress: + print(f"\nUploading {total_results} results in {total_chunks} chunks...") + _print_progress_bar(0, total_chunks, prefix='Uploading:', suffix='Complete') + for chunk_idx, chunk in enumerate(chunks): + spans = [] + metrics = [] + + # Process each result in the chunk + for result in chunk: + idx = result['idx'] + merged_result = result + output = merged_result.get('output') + input = merged_result.get('input', {}) + evaluations = merged_result.get('evaluations', {}) + expected_output = merged_result.get('expected_output', 
{}) + metadata = merged_result.get('metadata', {}) + error = merged_result.get('error', {}) + + # When the dataset is not hosted, we use the hash of the input and expected output as the dataset record id + dataset_record_id = hashlib.md5((str(input) + str(expected_output)).encode('utf-8')).hexdigest() + + span = { + "span_id": _make_id(), + "project_id": project_id, + "experiment_id": experiment_id, + "dataset_id": self.experiment.dataset._datadog_dataset_id, + #TODO: Extract the record id from the dataset for hosted datasets + "dataset_record_id": dataset_record_id, + "start_ns": int(metadata.get("timestamp", time.time()) * 1e9), + "duration": float(metadata.get("duration", 0) * 1e9), + "status": "ok" if not error else "error", + "metrics": {}, # TODO: Fill in with actual metrics once we have tracing and llm spans + "meta": { + "span": {"kind": "experiment"}, + "input": merged_result.get('input', {}), + "output": output, + "expected_output": merged_result.get('expected_output', {}), + "error": { + "message": error.get("message"), + "type": error.get("type"), + "stack": error.get("stack"), + } + }, + } + spans.append(span) + + # Add evaluation metrics + for metric_payload_name, metric_payload_value in evaluations.items(): + # Skip None values + if metric_payload_value is None: + print(f"Skipping None value for metric: {metric_payload_name}") + continue + + timestamp_ms = int(metadata.get("timestamp", time.time()) * 1000) + + # Check for bool first, since bool is a subclass of int + if isinstance(metric_payload_value["value"], (bool, str)): + metric_type = "categorical" + metric_value = str(metric_payload_value["value"]).lower() + elif isinstance(metric_payload_value["value"], (int, float)): + metric_type = "score" + else: + metric_type = "categorical" + metric_value = str(metric_payload_value["value"]) + + metric = { + "span_id": span["span_id"], + "metric_type": metric_type, + "timestamp_ms": timestamp_ms, + "label": metric_payload_name, + "score_value" if metric_type == "score" else "categorical_value": metric_value, + "error": metric_payload_value["error"], + } + metrics.append(metric) - # Prepare payload and send to Datadog - results_payload = { - "data": { - "type": "experiments", - "tags": self.experiment.tags + ["ddtrace.version:" + ddtrace.__version__], - "attributes": {"spans": spans, "metrics": metrics}, + # Prepare and send chunk payload + chunk_payload = { + "data": { + "type": "experiments", + "tags": self.experiment.tags + ["ddtrace.version:" + ddtrace.__version__], + "attributes": {"spans": spans, "metrics": metrics}, + } } - } + url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" + exp_http_request("POST", url, body=json.dumps(chunk_payload).encode("utf-8")) - url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" - exp_http_request("POST", url, body=json.dumps(results_payload).encode("utf-8")) + if show_progress: + _print_progress_bar(chunk_idx + 1, total_chunks, prefix='Uploading:', suffix='Complete') # Print URL to the experiment in Datadog - print(f"Experiment '{self.experiment.name}' created: {BASE_URL}/llm/experiments/experiment-list/{experiment_id} \n\n") + print(f"\nExperiment '{self.experiment.name}' created: {BASE_URL}/llm/experiments/experiment-list/{experiment_id}\n") def export_to_jsonl(self, file_path): """ From c05deb83d83c5a5a9ef2e08e31c4d80e7a7c3f02 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Mon, 27 Jan 2025 14:24:32 -0500 Subject: [PATCH 33/36] two buffers temporary --- .../contrib/internal/requests/connection.py | 
4 + ddtrace/llmobs/_constants.py | 2 +- ddtrace/llmobs/_experiments.py | 45 ++++-- ddtrace/llmobs/_llmobs.py | 32 +++++ ddtrace/llmobs/_utils.py | 2 + ddtrace/llmobs/_writer.py | 129 +++++++++++++++--- 6 files changed, 187 insertions(+), 27 deletions(-) diff --git a/ddtrace/contrib/internal/requests/connection.py b/ddtrace/contrib/internal/requests/connection.py index 06d3347f0a1..d7a19ec6eb0 100644 --- a/ddtrace/contrib/internal/requests/connection.py +++ b/ddtrace/contrib/internal/requests/connection.py @@ -102,7 +102,11 @@ def _wrap_send(func, instance, args, kwargs): span.set_tag(_ANALYTICS_SAMPLE_RATE_KEY, cfg.get("analytics_sample_rate", True)) # propagate distributed tracing headers + # breakpoint() if cfg.get("distributed_tracing"): + # breakpoint() + print("propagating headers") + print(span.context) HTTPPropagator.inject(span.context, request.headers) response = response_headers = None diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py index 27000b36aac..05f3b599664 100644 --- a/ddtrace/llmobs/_constants.py +++ b/ddtrace/llmobs/_constants.py @@ -15,7 +15,7 @@ INPUT_VALUE = "_ml_obs.meta.input.value" INPUT_PARAMETERS = "_ml_obs.meta.input.parameters" INPUT_PROMPT = "_ml_obs.meta.input.prompt" - +EXPECTED_OUTPUT = "_ml_obs.meta.input.expected_output" OUTPUT_DOCUMENTS = "_ml_obs.meta.output.documents" OUTPUT_MESSAGES = "_ml_obs.meta.output.messages" OUTPUT_VALUE = "_ml_obs.meta.output.value" diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index b9dfc360e64..835855f5d5b 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -21,6 +21,9 @@ from ddtrace.context import Context import ddtrace +from ddtrace import patch_all + +patch_all() DD_SITE = os.getenv("DD_SITE", "datadoghq.com") if DD_SITE == "datadoghq.com": @@ -35,11 +38,12 @@ class FileType(Enum): ml_app="experiment-jonathan", integrations_enabled=True, agentless_enabled=True, - site="datadoghq.com", + site=os.getenv("DD_SITE"), api_key=os.getenv("DD_API_KEY"), ) + class Dataset: """A container for LLM experiment data that can be pushed to and retrieved from Datadog. 
@@ -61,6 +65,8 @@ def __init__(self, name: str, data: Optional[List[Dict[str, Union[str, Dict[str, """ self.name = name self.description = description + self.version = 0 + # If no data provided, attempt to pull from Datadog if data is None: @@ -70,10 +76,12 @@ def __init__(self, name: str, data: Optional[List[Dict[str, Union[str, Dict[str, pulled_dataset = self.pull(name) self._data = pulled_dataset._data self._datadog_dataset_id = pulled_dataset._datadog_dataset_id + self._version = pulled_dataset._datadog_dataset_version else: self._validate_data(data) self._data = data self._datadog_dataset_id = None + self._version = 0 def __iter__(self) -> Iterator[Dict[str, Union[str, Dict[str, Any]]]]: return iter(self._data) @@ -143,6 +151,8 @@ def pull(cls, name: str) -> "Dataset": raise ValueError(f"Dataset '{name}' not found") dataset_id = datasets[0]["id"] + dataset_version = datasets[0]["attributes"]["current_version"] + # Get dataset records url = f"/api/unstable/llm-obs/v1/datasets/{dataset_id}/records" @@ -168,6 +178,7 @@ def pull(cls, name: str) -> "Dataset": # Create new dataset instance dataset = cls(name, class_records) dataset._datadog_dataset_id = dataset_id + dataset._datadog_dataset_version = dataset_version return dataset def push(self, chunk_size: int = 300) -> None: @@ -207,6 +218,7 @@ def push(self, chunk_size: int = 300) -> None: response_data = resp.json() dataset_id = response_data["data"]["id"] self._datadog_dataset_id = dataset_id + self._datadog_dataset_version = 0 else: # Dataset exists, raise error raise ValueError( @@ -522,12 +534,17 @@ def run_task( Raises: ValueError: If _jobs is not between 1 and 30 """ + os.environ["DD_EXPERIMENTS_RUNNER_ENABLED"] = "True" if not 1 <= _jobs <= 30: raise ValueError("Number of jobs must be between 1 and 30") - @agent - def instrumented_task(input_data, config=None): # To trace the task - return self.task(input_data, config) + def instrumented_task(input_data, expected_output, config=None): + with LLMObs._experiment(name="experiment-task") as span: + span.context.set_baggage_item("is_experiment_task", True) + output = self.task(input_data, config) + # LLMObs._tag_expected_output(span, expected_output) + LLMObs.annotate(span, input_data=input_data, output_data=output) + return output self.outputs = [] total_rows = len(self.dataset) @@ -537,15 +554,15 @@ def instrumented_task(input_data, config=None): # To trace the task def process_row(idx_row): idx, row = idx_row start_time = time.time() - ddtrace.tracer.context_provider.activate(Context()) try: input_data = row['input'] + expected_output = row['expected_output'] if getattr(self.task, '_accepts_config', False): - output = instrumented_task(input_data, self.config) + output = instrumented_task(input_data, expected_output, self.config) else: - output = instrumented_task(input_data) + output = instrumented_task(input_data, expected_output) # Periodic flush every 10 rows (approximate because it's concurrent) if idx % 10 == 0: @@ -648,6 +665,8 @@ def process_row(idx_row): LLMObs.flush() error_rate = (error_count / total_rows) * 100 + os.environ["DD_EXPERIMENTS_RUNNER_ENABLED"] = "False" + os.environ["DD_LLMOBS_ENABLED"] = "False" print(f"Task completed with {error_count} errors ({error_rate:.2f}% error rate)") if error_count > 0: print("If you'd like to halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.") @@ -932,6 +951,7 @@ def push(self, chunk_size: int = 300) -> None: "description": self.experiment.description, "dataset_id": 
self.experiment.dataset._datadog_dataset_id, "project_id": project_id, + "dataset_version": self.experiment.dataset._datadog_dataset_version, "metadata": { "tags": self.experiment.tags, **(self.experiment.metadata or {}), @@ -989,7 +1009,7 @@ def push(self, chunk_size: int = 300) -> None: "status": "ok" if not error else "error", "metrics": {}, # TODO: Fill in with actual metrics once we have tracing and llm spans "meta": { - "span": {"kind": "experiment"}, + "span": {"kind": "experiment-result"}, "input": merged_result.get('input', {}), "output": output, "expected_output": merged_result.get('expected_output', {}), @@ -1011,16 +1031,21 @@ def push(self, chunk_size: int = 300) -> None: timestamp_ms = int(metadata.get("timestamp", time.time()) * 1000) + if metric_payload_value["value"] == None: + metric_type = "categorical" + metric_value = None # Check for bool first, since bool is a subclass of int - if isinstance(metric_payload_value["value"], (bool, str)): + elif isinstance(metric_payload_value["value"], (bool, str)): metric_type = "categorical" metric_value = str(metric_payload_value["value"]).lower() elif isinstance(metric_payload_value["value"], (int, float)): metric_type = "score" + metric_value = metric_payload_value["value"] else: metric_type = "categorical" metric_value = str(metric_payload_value["value"]) + metric = { "span_id": span["span_id"], "metric_type": metric_type, @@ -1037,7 +1062,7 @@ def push(self, chunk_size: int = 300) -> None: "data": { "type": "experiments", "tags": self.experiment.tags + ["ddtrace.version:" + ddtrace.__version__], - "attributes": {"spans": spans, "metrics": metrics}, + "attributes": {"spans": [], "metrics": []} #metrics}, #TODO: Remove this whole thing since experiment spans results will be part of tracing } } diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index b4f1dc1b2f6..0da238f3d0d 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -52,6 +52,7 @@ from ddtrace.llmobs._constants import SPAN_KIND from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING from ddtrace.llmobs._constants import TAGS +from ddtrace.llmobs._constants import EXPECTED_OUTPUT from ddtrace.llmobs._evaluators.runner import EvaluatorRunner from ddtrace.llmobs._utils import AnnotationContext from ddtrace.llmobs._utils import _get_llmobs_parent_id @@ -193,6 +194,9 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: span._set_ctx_item(ML_APP, ml_app) parent_id = str(_get_llmobs_parent_id(span) or "undefined") + if span._get_ctx_item(EXPECTED_OUTPUT) is not None: + meta["expected_output"] = span._get_ctx_item(EXPECTED_OUTPUT) + llmobs_span_event = { "trace_id": "{:x}".format(span.trace_id), "span_id": str(span.span_id), @@ -626,6 +630,22 @@ def agent(cls, name: Optional[str] = None, session_id: Optional[str] = None, ml_ if cls.enabled is False: log.warning(SPAN_START_WHILE_DISABLED_WARNING) return cls._instance._start_span("agent", name=name, session_id=session_id, ml_app=ml_app) + + @classmethod + def _experiment(cls, name: Optional[str] = None, session_id: Optional[str] = None, ml_app: Optional[str] = None) -> Span: + """ + Trace a dynamic workflow in which an embedded language model (agent) decides what sequence of actions to take. + + :param str name: The name of the traced operation. If not provided, a default value of "agent" will be set. + :param str session_id: The ID of the underlying user session. Required for tracking sessions. 
+ :param str ml_app: The name of the ML application that the agent is orchestrating. If not provided, the default + value will be set to the value of `DD_LLMOBS_ML_APP`. + + :returns: The Span object representing the traced operation. + """ + if cls.enabled is False: + log.warning(SPAN_START_WHILE_DISABLED_WARNING) + return cls._instance._start_span("experiment", name=name, session_id=session_id, ml_app=ml_app) @classmethod def workflow( @@ -788,6 +808,18 @@ def annotate( else: cls._tag_text_io(span, input_value=input_data, output_value=output_data) + @staticmethod + def _tag_expected_output(span, expected_output: dict) -> None: + """Tags a given LLMObs span with a prompt""" + try: + span._set_ctx_item(EXPECTED_OUTPUT, expected_output) + print("added expected output") + print("expected output: ", span._get_ctx_item(EXPECTED_OUTPUT)) + print("span: ", span) + except TypeError: + log.warning("Failed to validate expected output with error: ", exc_info=True) + return + @staticmethod def _tag_prompt(span, prompt: dict) -> None: """Tags a given LLMObs span with a prompt""" diff --git a/ddtrace/llmobs/_utils.py b/ddtrace/llmobs/_utils.py index 827603cc93d..74751944621 100644 --- a/ddtrace/llmobs/_utils.py +++ b/ddtrace/llmobs/_utils.py @@ -167,6 +167,7 @@ def _get_session_id(span: Span) -> Optional[str]: def _inject_llmobs_parent_id(span_context): """Inject the LLMObs parent ID into the span context for reconnecting distributed LLMObs traces.""" span = ddtrace.tracer.current_span() + if span is None: log.warning("No active span to inject LLMObs parent ID info.") return @@ -178,6 +179,7 @@ def _inject_llmobs_parent_id(span_context): llmobs_parent_id = str(span.span_id) else: llmobs_parent_id = _get_llmobs_parent_id(span) + span_context._meta[PROPAGATED_PARENT_ID_KEY] = llmobs_parent_id or "undefined" diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index e1dd9280ff7..b8a2756d22c 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -32,6 +32,8 @@ from ddtrace.llmobs._constants import EVP_SUBDOMAIN_HEADER_NAME from ddtrace.llmobs._constants import EVP_SUBDOMAIN_HEADER_VALUE from ddtrace.llmobs._utils import safe_json +from ddtrace.internal.utils.formats import asbool +import os logger = get_logger(__name__) @@ -188,35 +190,101 @@ def __len__(self): def _init_buffer(self): with self._lock: self._buffer = [] + self._experiment_buffer = [] self.buffer_size = 0 def put(self, events: List[LLMObsSpanEvent]): - # events always has only 1 event - with List type to be compatible with HTTPWriter interfaces - with self._lock: - if len(self._buffer) >= self._buffer_limit: - logger.warning( - "%r event buffer full (limit is %d), dropping event", self.__class__.__name__, self._buffer_limit - ) - return - self._buffer.extend(events) - self.buffer_size += len(safe_json(events)) + # Split incoming events into normal vs experiment spans + norm_events = [] + exp_events = [] + for e in events: + if e.get("meta", {}).get("span.kind") == "experiment": + exp_events.append(e) + else: + norm_events.append(e) + + # Add normal spans to main buffer + if norm_events: + with self._lock: + if len(self._buffer) + len(norm_events) > self._buffer_limit: + logger.warning("Dropping normal spans: buffer limit reached") + return + self._buffer.extend(norm_events) + self.buffer_size += len(safe_json(norm_events)) + + # Add experiment spans to separate buffer + if exp_events: + with self._lock: + if len(self._experiment_buffer) + len(exp_events) > self._buffer_limit: + logger.warning("Dropping 
experiment spans: buffer limit reached") + return + self._experiment_buffer.extend(exp_events) + self.buffer_size += len(safe_json(exp_events)) def encode(self): + """Encode only the normal spans for standard flush""" with self._lock: if not self._buffer: return None, 0 events = self._buffer + + # Save experiment buffer before _init_buffer() clears it + experiment_spans = self._experiment_buffer self._init_buffer() - data = {"_dd.stage": "raw", "_dd.tracer_version": ddtrace.__version__, "event_type": "span", "spans": events} + + data = { + "_dd.stage": "raw", + "_dd.tracer_version": ddtrace.__version__, + "event_type": "span", + "spans": events + } + + if asbool(os.getenv("DD_EXPERIMENTS_RUNNER_ENABLED")): + data["_dd.scope"] = "experiments" + + try: + enc_data = safe_json(data) + if isinstance(enc_data, str): + enc_data = enc_data.encode('utf-8') + logger.debug("encode %d LLMObs span events", len(events)) + except TypeError: + logger.error("failed to encode LLMObs span events", exc_info=True) + return None, 0 + + # Restore experiment buffer + with self._lock: + self._experiment_buffer = experiment_spans + + return enc_data, len(events) + + def encode_experiment_spans(self): + """Encode only the experiment spans for separate request""" + with self._lock: + if not self._experiment_buffer: + return None, 0 + exp_events = self._experiment_buffer + self._experiment_buffer = [] + + data = { + "_dd.stage": "raw", + "_dd.tracer_version": ddtrace.__version__, + "event_type": "experiment-span", + "experiment_spans": exp_events + } + + if asbool(os.getenv("DD_EXPERIMENTS_RUNNER_ENABLED")): + data["_dd.scope"] = "experiments" + try: - enc_llm_events = safe_json(data) - if isinstance(enc_llm_events, str): - enc_llm_events = enc_llm_events.encode('utf-8') - logger.debug("encode %d LLMObs span events to be sent", len(events)) + enc_data = safe_json(data) + if isinstance(enc_data, str): + enc_data = enc_data.encode('utf-8') + logger.debug("encode %d LLMObs experiment span events", len(exp_events)) except TypeError: - logger.error("failed to encode %d LLMObs span events", len(events), exc_info=True) + logger.error("failed to encode LLMObs experiment span events", exc_info=True) return None, 0 - return enc_llm_events, len(events) + + return enc_data, len(exp_events) class LLMObsEventClient(WriterClientBase): @@ -310,6 +378,35 @@ def recreate(self): is_agentless=config._llmobs_agentless_enabled, ) + def periodic(self) -> None: + # First flush normal spans using parent logic + super(LLMObsSpanWriter, self).periodic() + + # Then flush experiment spans in a separate request + for client in self._clients: + if isinstance(client, LLMObsEventClient) and isinstance(client.encoder, LLMObsSpanEncoder): + encoded, count = client.encoder.encode_experiment_spans() + if not encoded or not count: + continue + + try: + print("Sending experiment spans") + print(encoded) + self._send_payload_with_backoff(encoded, count, client) + except Exception: + self._metrics_dist("http.errors", tags=["type:err"]) + self._metrics_dist("http.dropped.bytes", len(encoded)) + self._metrics_dist("http.dropped.traces", count) + logger.error( + "failed to send %d experiment spans to %s", + count, + self.intake_url, + exc_info=True + ) + else: + self._metrics_dist("http.sent.bytes", len(encoded)) + self._metrics_dist("http.sent.traces", count) + def _truncate_span_event(event: LLMObsSpanEvent) -> LLMObsSpanEvent: event["meta"]["input"] = {"value": DROPPED_VALUE_TEXT} From 1b0800b3817974324f3efc4cc18e1014002bd194 Mon Sep 17 00:00:00 2001 From: 
Jonathan Chavez Date: Thu, 30 Jan 2025 12:14:06 -0500 Subject: [PATCH 34/36] switch trace ingestion path --- ddtrace/llmobs/_experiments.py | 488 +++++++++++++++++++-------------- ddtrace/llmobs/_llmobs.py | 3 - ddtrace/llmobs/_writer.py | 2 - 3 files changed, 281 insertions(+), 212 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 835855f5d5b..d6d72284a6f 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -170,6 +170,7 @@ def pull(cls, name: str) -> "Dataset": expected_output = attrs.get("expected_output") class_records.append({ + "record_id": record.get("id"), "input": input_data, "expected_output": expected_output, **attrs.get("metadata", {}), @@ -182,16 +183,10 @@ def pull(cls, name: str) -> "Dataset": return dataset def push(self, chunk_size: int = 300) -> None: - """Push the dataset to Datadog. + """Push the dataset to Datadog and refresh with pulled data. Args: chunk_size: Number of records to upload in each chunk. Defaults to 300. - - Returns: - Dict[str, Any]: Dictionary containing dataset information including: - - dataset_id: The ID of the created/updated dataset - - dataset_name: The name of the dataset - - record_count: Number of records uploaded """ # Check if dataset exists encoded_name = quote(self.name) @@ -245,6 +240,12 @@ def push(self, chunk_size: int = 300) -> None: if show_progress: _print_progress_bar(i + 1, total_chunks, prefix='Uploading:', suffix='Complete') + # Pull the dataset to get all record IDs and metadata + pulled_dataset = self.pull(self.name) + self._data = pulled_dataset._data + self._datadog_dataset_id = pulled_dataset._datadog_dataset_id + self._datadog_dataset_version = pulled_dataset._datadog_dataset_version + # Print url to the dataset in Datadog print(f"\nDataset '{self.name}' created: {BASE_URL}/llm/experiments/datasets/{dataset_id}\n") @@ -504,14 +505,16 @@ def __init__( self.metadata = metadata self.config = config - # Enforce that the task function has the @task decorator - if not hasattr(self.task, '_is_task'): + # Make sure the task is decorated with @task + if not hasattr(self.task, "_is_task"): raise TypeError("Task function must be decorated with @task decorator.") - # Enforce that all evaluators have the @evaluator decorator + # Make sure every evaluator is decorated with @evaluator for evaluator_func in self.evaluators: - if not hasattr(evaluator_func, '_is_evaluator'): - raise TypeError(f"Evaluator '{evaluator_func.__name__}' must be decorated with @evaluator decorator.") + if not hasattr(evaluator_func, "_is_evaluator"): + raise TypeError( + f"Evaluator '{evaluator_func.__name__}' must be decorated with @evaluator decorator." + ) # Post-run attributes self.has_run = False @@ -519,98 +522,228 @@ def __init__( self.outputs = [] self.evaluations = [] - def run_task( + # We'll store the experiment's Datadog ID once it's created. + self._datadog_experiment_id: Optional[str] = None + self._datadog_project_id: Optional[str] = None + + def _get_or_create_project(self) -> str: + """ + Internal helper to retrieve or create a project in Datadog, returning the project_id. 
+ """ + url = f"/api/unstable/llm-obs/v1/projects?filter[name]={self.project_name}" + resp = exp_http_request("GET", url) + response_data = resp.json() + projects = response_data.get("data", []) + + if not projects: + # Create new project + project_payload = { + "data": { + "type": "projects", + "attributes": { + "name": self.project_name, + "description": "", + "metadata": {"team": "ml-obs"}, + }, + } + } + resp = exp_http_request( + "POST", + "/api/unstable/llm-obs/v1/projects", + body=json.dumps(project_payload).encode("utf-8"), + ) + response_data = resp.json() + return response_data["data"]["id"] + else: + return projects[0]["id"] + + def _create_experiment_in_datadog(self) -> str: + """ + Internal helper to create an experiment in Datadog, returning the new experiment_id. + Raises ValueError if the dataset hasn't been pushed (no _datadog_dataset_id). + """ + if not self.dataset._datadog_dataset_id: + raise ValueError( + "Dataset must be pushed to Datadog (so it has an ID) before creating an experiment. " + "Please call dataset.push() first." + ) + + project_id = self._get_or_create_project() + + experiment_payload = { + "data": { + "type": "experiments", + "attributes": { + "name": self.name, + "description": self.description, + "dataset_id": self.dataset._datadog_dataset_id, + "project_id": project_id, + "dataset_version": self.dataset._datadog_dataset_version, + "metadata": { + "tags": self.tags, + **(self.metadata or {}), + "config": self.config, + }, + "ensure_unique": True, + }, + } + } + resp = exp_http_request( + "POST", + "/api/unstable/llm-obs/v1/experiments", + body=json.dumps(experiment_payload).encode("utf-8"), + ) + response_data = resp.json() + experiment_id = response_data["data"]["id"] + + # The API may rename the experiment (e.g., adding a suffix), so update local name: + self.name = response_data["data"]["attributes"]["name"] + return experiment_id + + def run( self, _jobs: int = 10, raise_errors: bool = False, - ) -> None: - """Execute the task function on the dataset and store the outputs. + ) -> "ExperimentResults": + """ + Execute the task and evaluations, returning the results. + Here, we guarantee an experiment is created first, + so run_task() can tag traces with the real experiment ID. + """ + print("Running experiment...") + # 1) Make sure the dataset is pushed + if not self.dataset._datadog_dataset_id: + raise ValueError( + "Dataset must be pushed to Datadog before running the experiment." + ) - Args: - _jobs: Number of concurrent jobs to run (between 1-20). Defaults to 10. - raise_errors: If True, raises exceptions from failed tasks. If False, stores - errors in the output. Defaults to False. + # 2) Create project + experiment if this hasn't been done yet + if not self._datadog_experiment_id: + project_id = self._get_or_create_project() # your existing helper + self._datadog_project_id = project_id + + experiment_id = self._create_experiment_in_datadog() # your existing helper + self._datadog_experiment_id = experiment_id - Raises: - ValueError: If _jobs is not between 1 and 30 + # 3) Now run the task and evaluations + self.run_task(_jobs=_jobs, raise_errors=raise_errors) + experiment_results = self.run_evaluations(raise_errors=raise_errors) + return experiment_results + + def run_task( + self, + _jobs: int = 10, + raise_errors: bool = False, + ) -> None: + """ + Execute the task function on the dataset and store the outputs. + The caller (run()) ensures that self._datadog_experiment_id is set first. 
""" os.environ["DD_EXPERIMENTS_RUNNER_ENABLED"] = "True" if not 1 <= _jobs <= 30: raise ValueError("Number of jobs must be between 1 and 30") - - def instrumented_task(input_data, expected_output, config=None): + + def instrumented_task( + record_id: str, input_data: Any, expected_output: Any, config: Optional[Dict[str, Any]] = None + ): with LLMObs._experiment(name="experiment-task") as span: span.context.set_baggage_item("is_experiment_task", True) output = self.task(input_data, config) - # LLMObs._tag_expected_output(span, expected_output) - LLMObs.annotate(span, input_data=input_data, output_data=output) - return output - + LLMObs.annotate( + span, + input_data=input_data, + output_data=output, + tags={ + "dataset_id": self.dataset._datadog_dataset_id, + "dataset_record_id": record_id, + "experiment_id": self._datadog_experiment_id, + + }, + ) + LLMObs._tag_expected_output(span, expected_output) + return (output, span) + self.outputs = [] total_rows = len(self.dataset) completed = 0 - error_count = 0 + error_count = 0 def process_row(idx_row): idx, row = idx_row start_time = time.time() - - try: - input_data = row['input'] - expected_output = row['expected_output'] - - if getattr(self.task, '_accepts_config', False): - output = instrumented_task(input_data, expected_output, self.config) - else: - output = instrumented_task(input_data, expected_output) - - # Periodic flush every 10 rows (approximate because it's concurrent) - if idx % 10 == 0: - LLMObs.flush() - - output_data = { - "idx": idx, - "output": output, - "metadata": { - "timestamp": start_time, - "duration": time.time() - start_time, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "error": { - "message": None, - "stack": None, - "type": None, + + with LLMObs._experiment(name="experiment-task") as span: + span.context.set_baggage_item("is_experiment_task", True) + try: + input_data = row["input"] + expected_output = row["expected_output"] + + if getattr(self.task, "_accepts_config", False): + output = self.task(input_data, self.config) + else: + output = self.task(input_data) + + # Periodic flush for concurrency + if idx % 10 == 0: + LLMObs.flush() + + LLMObs.annotate( + span, + input_data=input_data, + output_data=output, + tags={ + "dataset_id": self.dataset._datadog_dataset_id, + "dataset_record_id": row["record_id"], + "experiment_id": self._datadog_experiment_id, + }, + ) + LLMObs._tag_expected_output(span, expected_output) + + return { + "idx": idx, + "output": output, + "metadata": { + "timestamp": start_time, + "duration": time.time() - start_time, + "dataset_record_index": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + "span_id": span.span_id, + "trace_id": span.trace_id, + }, + "error": {"message": None, "stack": None, "type": None}, } - } - return output_data - except Exception as e: - error_message = str(e) - return { - "idx": idx, - "output": None, - "metadata": { - "timestamp": start_time, - "duration": time.time() - start_time, - "dataset_record_idx": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - }, - "error": { - "message": error_message, - "stack": traceback.format_exc(), - "type": type(e).__name__, + except Exception as e: + error_message = str(e) + return { + "idx": idx, + "output": None, + "metadata": { + "timestamp": start_time, + "duration": time.time() - start_time, + 
"dataset_record_index": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + "span_id": span.span_id, + "trace_id": span.trace_id, + }, + "error": { + "message": error_message, + "stack": traceback.format_exc(), + "type": type(e).__name__, + } } - } _print_progress_bar(0, total_rows, prefix='Processing:', suffix='Complete') with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: - futures = {executor.submit(process_row, (idx, row)): idx for idx, row in enumerate(self.dataset)} + futures = { + executor.submit(process_row, (idx, row)): idx + for idx, row in enumerate(self.dataset) + } outputs_buffer = [None] * total_rows try: @@ -619,11 +752,13 @@ def process_row(idx_row): try: output_data = future.result() outputs_buffer[idx] = output_data - if raise_errors and output_data['error']['message']: - error_message = output_data['error']['message'] - raise ExperimentTaskError(error_message, idx, output_data['error']['type']) - - elif output_data['error']['message']: + if raise_errors and output_data["error"]["message"]: + error_message = output_data["error"]["message"] + raise ExperimentTaskError( + error_message, idx, output_data["error"]["type"] + ) + + elif output_data["error"]["message"]: error_count += 1 except Exception as e: @@ -633,16 +768,18 @@ def process_row(idx_row): "metadata": { "timestamp": time.time(), "duration": 0, - "dataset_record_idx": idx, + "dataset_record_index": idx, "project_name": self.project_name, "experiment_name": self.name, "dataset_name": self.dataset.name, + "span_id": span.span_id, + "trace_id": span.trace_id, }, "error": { "message": str(e), "stack": traceback.format_exc(), "type": type(e).__name__, - } + }, } if raise_errors: raise e @@ -650,7 +787,7 @@ def process_row(idx_row): error_count += 1 completed += 1 - _print_progress_bar(completed, total_rows, prefix='Processing:', suffix='Complete') + _print_progress_bar(completed, total_rows, prefix="Processing:", suffix="Complete") except Exception as e: for future in futures: @@ -660,8 +797,7 @@ def process_row(idx_row): self.outputs = outputs_buffer self.has_run = True - - # Final flush at the end + LLMObs.flush() error_rate = (error_count / total_rows) * 100 @@ -669,9 +805,16 @@ def process_row(idx_row): os.environ["DD_LLMOBS_ENABLED"] = "False" print(f"Task completed with {error_count} errors ({error_rate:.2f}% error rate)") if error_count > 0: - print("If you'd like to halt execution on errors and see the full traceback, set `raise_errors=True` when running the experiment.") + print( + "If you'd like to halt execution on errors and see the full traceback, " + "set `raise_errors=True` when running the experiment." + ) - def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_errors: bool = False) -> "ExperimentResults": + def run_evaluations( + self, + evaluators: Optional[List[Callable]] = None, + raise_errors: bool = False + ) -> "ExperimentResults": """Run evaluators on the outputs and return ExperimentResults. Args: @@ -753,26 +896,7 @@ def run_evaluations(self, evaluators: Optional[List[Callable]] = None, raise_err self.has_evaluated = True return ExperimentResults(self.dataset, self, self.outputs, evaluations) - def run( - self, - _jobs: int = 10, - raise_errors: bool = False, - ) -> "ExperimentResults": - """Execute the task and evaluations, returning the results. - - Args: - _jobs (int): Number of worker threads. - timeout (float, optional): Time limit for the task execution in seconds. 
- raise_errors (bool): If True, raises exceptions from failed tasks. If False, stores - errors in the output. Defaults to False. - - Returns: - ExperimentResults: The results of the experiment. - """ - self.run_task(_jobs=_jobs, raise_errors=raise_errors) - experiment_results = self.run_evaluations(raise_errors=raise_errors) - return experiment_results - + class ExperimentResults: """Contains and manages the results of an experiment run. @@ -808,6 +932,7 @@ def _merge_results(self) -> List[Dict[str, Any]]: merged_result = { "idx": idx, + "record_id": dataset_record.get('record_id'), "input": dataset_record.get('input', {}), "expected_output": dataset_record.get('expected_output', {}), "output": output_data.get('output'), @@ -901,80 +1026,43 @@ def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": return final_df def push(self, chunk_size: int = 300) -> None: - """Push the experiment results to Datadog. - - Args: - chunk_size: Number of records to upload in each chunk. Defaults to 300. - - Raises: - ValueError: If the dataset hasn't been pushed to Datadog first """ + Push the experiment results to Datadog, without re-creating the project/experiment. + Assumes self.experiment._datadog_experiment_id and self.experiment._datadog_project_id + have already been set in Experiment.run(). + """ + # Ensure the dataset is hosted in Datadog if not self.experiment.dataset._datadog_dataset_id: raise ValueError( "Dataset has not been pushed to Datadog. " "Please call dataset.push() before pushing experiment results." ) - # Check if project exists - url = f"/api/unstable/llm-obs/v1/projects?filter[name]={self.experiment.project_name}" - resp = exp_http_request("GET", url) - response_data = resp.json() - projects = response_data.get("data", []) - if not projects: - # Create new project - project_payload = { - "data": { - "type": "projects", - "attributes": { - "name": self.experiment.project_name, - "description": "", - "metadata": {"team": "ml-obs"}, - }, - } - } - resp = exp_http_request( - "POST", - "/api/unstable/llm-obs/v1/projects", - body=json.dumps(project_payload).encode("utf-8"), + # Ensure the experiment was already created (via run()) + if not self.experiment._datadog_experiment_id: + raise ValueError( + "Experiment has not been created in Datadog. " + "Please call experiment.run() before pushing results." 
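
The rewritten push() above no longer creates the project or experiment; it only validates the stored IDs and then reports evaluations through the eval-metric intake, joining each metric to the span produced while the task ran. As a rough illustration of one metric entry assembled further down in this method (the ids, timestamp, and values are invented):

example_metric = {
    "span_id": "8367351304781251223",
    "trace_id": "67a2c1de00000000a9f3b6c2d4e5f601",
    "metric_type": "categorical",        # bools and strings map here; ints/floats become "score"
    "timestamp_ms": 1706900000000,
    "label": "non_empty",
    "categorical_value": "true",         # boolean evaluator results are lower-cased strings
    "error": None,
    "join_on": {
        "span": {
            "trace_id": "67a2c1de00000000a9f3b6c2d4e5f601",
            "span_id": "8367351304781251223",
        },
    },
}
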
) - response_data = resp.json() - project_id = response_data["data"]["id"] - else: - project_id = projects[0]["id"] - # Create new experiment - experiment_payload = { - "data": { - "type": "experiments", - "attributes": { - "name": self.experiment.name, - "description": self.experiment.description, - "dataset_id": self.experiment.dataset._datadog_dataset_id, - "project_id": project_id, - "dataset_version": self.experiment.dataset._datadog_dataset_version, - "metadata": { - "tags": self.experiment.tags, - **(self.experiment.metadata or {}), - "config": self.experiment.config, - }, - "ensure_unique": True, # Generates a new experiment with a unique name if the experiment name already exists - }, - } - } - resp = exp_http_request( - "POST", "/api/unstable/llm-obs/v1/experiments", body=json.dumps(experiment_payload).encode("utf-8") - ) - response_data = resp.json() - experiment_id = response_data["data"]["id"] - self.experiment.name = response_data["data"]["attributes"]["name"] + # Grab IDs from the already-created experiment + experiment_id = self.experiment._datadog_experiment_id + project_id = self.experiment._datadog_project_id + experiment_name = self.experiment.name + + # Now proceed with chunked uploading of your results — no project or experiment creation here. - # Process results in chunks total_results = len(self.merged_results) - chunks = [self.merged_results[i:i + chunk_size] for i in range(0, total_results, chunk_size)] + # Optional progress bar + show_progress = total_results > chunk_size + + # Just an example of how you'd do chunked uploads: + chunks = [ + self.merged_results[i : i + chunk_size] + for i in range(0, total_results, chunk_size) + ] total_chunks = len(chunks) - # Only show progress bar for large result sets - show_progress = total_results > chunk_size if show_progress: print(f"\nUploading {total_results} results in {total_chunks} chunks...") _print_progress_bar(0, total_chunks, prefix='Uploading:', suffix='Complete') @@ -988,39 +1076,15 @@ def push(self, chunk_size: int = 300) -> None: idx = result['idx'] merged_result = result output = merged_result.get('output') + record_id = merged_result.get('record_id') input = merged_result.get('input', {}) evaluations = merged_result.get('evaluations', {}) expected_output = merged_result.get('expected_output', {}) - metadata = merged_result.get('metadata', {}) error = merged_result.get('error', {}) + metadata = merged_result.get('metadata', {}) + span_id = metadata.get('span_id') + trace_id = metadata.get('trace_id') - # When the dataset is not hosted, we use the hash of the input and expected output as the dataset record id - dataset_record_id = hashlib.md5((str(input) + str(expected_output)).encode('utf-8')).hexdigest() - - span = { - "span_id": _make_id(), - "project_id": project_id, - "experiment_id": experiment_id, - "dataset_id": self.experiment.dataset._datadog_dataset_id, - #TODO: Extract the record id from the dataset for hosted datasets - "dataset_record_id": dataset_record_id, - "start_ns": int(metadata.get("timestamp", time.time()) * 1e9), - "duration": float(metadata.get("duration", 0) * 1e9), - "status": "ok" if not error else "error", - "metrics": {}, # TODO: Fill in with actual metrics once we have tracing and llm spans - "meta": { - "span": {"kind": "experiment-result"}, - "input": merged_result.get('input', {}), - "output": output, - "expected_output": merged_result.get('expected_output', {}), - "error": { - "message": error.get("message"), - "type": error.get("type"), - "stack": error.get("stack"), - } - }, 
- } - spans.append(span) # Add evaluation metrics for metric_payload_name, metric_payload_value in evaluations.items(): @@ -1047,33 +1111,43 @@ def push(self, chunk_size: int = 300) -> None: metric = { - "span_id": span["span_id"], + "span_id": str(span_id), + "trace_id": str(trace_id), "metric_type": metric_type, "timestamp_ms": timestamp_ms, "label": metric_payload_name, "score_value" if metric_type == "score" else "categorical_value": metric_value, "error": metric_payload_value["error"], + "join_on": { + "span": { + "trace_id": str(trace_id), + "span_id": str(span_id), + }, + } } metrics.append(metric) + # Prepare and send chunk payload chunk_payload = { "data": { - "type": "experiments", - "tags": self.experiment.tags + ["ddtrace.version:" + ddtrace.__version__], - "attributes": {"spans": [], "metrics": []} #metrics}, #TODO: Remove this whole thing since experiment spans results will be part of tracing + "type": "evaluation_metric", + "attributes": {"scope": "experiments", "metrics": metrics, "tags": self.experiment.tags + ["ddtrace.version:" + ddtrace.__version__, "experiment_id:" + experiment_id]}, } } - url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" + print("chunk_payload: ", chunk_payload) + + url = f"/api/intake/llm-obs/v2/eval-metric" exp_http_request("POST", url, body=json.dumps(chunk_payload).encode("utf-8")) if show_progress: - _print_progress_bar(chunk_idx + 1, total_chunks, prefix='Uploading:', suffix='Complete') + _print_progress_bar( + chunk_idx + 1, total_chunks, prefix='Uploading:', suffix='Complete' + ) - # Print URL to the experiment in Datadog - print(f"\nExperiment '{self.experiment.name}' created: {BASE_URL}/llm/experiments/experiment-list/{experiment_id}\n") + print(f"\nExperiment '{experiment_name}' results pushed to Datadog.\n") def export_to_jsonl(self, file_path): """ diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 0da238f3d0d..ef5d96ffa88 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -813,9 +813,6 @@ def _tag_expected_output(span, expected_output: dict) -> None: """Tags a given LLMObs span with a prompt""" try: span._set_ctx_item(EXPECTED_OUTPUT, expected_output) - print("added expected output") - print("expected output: ", span._get_ctx_item(EXPECTED_OUTPUT)) - print("span: ", span) except TypeError: log.warning("Failed to validate expected output with error: ", exc_info=True) return diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index b8a2756d22c..7f6be1d6fd4 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -390,8 +390,6 @@ def periodic(self) -> None: continue try: - print("Sending experiment spans") - print(encoded) self._send_payload_with_backoff(encoded, count, client) except Exception: self._metrics_dist("http.errors", tags=["type:err"]) From b8a247259c1c79e13848fedd99c01576fe230a90 Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Wed, 5 Feb 2025 16:46:09 -0500 Subject: [PATCH 35/36] tracing stable --- .../contrib/internal/requests/connection.py | 4 - ddtrace/llmobs/_constants.py | 2 + ddtrace/llmobs/_experiments.py | 194 ++++++------------ ddtrace/llmobs/_integrations/base.py | 3 +- ddtrace/llmobs/_llmobs.py | 19 ++ ddtrace/llmobs/_writer.py | 124 ++--------- 6 files changed, 104 insertions(+), 242 deletions(-) diff --git a/ddtrace/contrib/internal/requests/connection.py b/ddtrace/contrib/internal/requests/connection.py index d7a19ec6eb0..06d3347f0a1 100644 --- a/ddtrace/contrib/internal/requests/connection.py +++ 
b/ddtrace/contrib/internal/requests/connection.py @@ -102,11 +102,7 @@ def _wrap_send(func, instance, args, kwargs): span.set_tag(_ANALYTICS_SAMPLE_RATE_KEY, cfg.get("analytics_sample_rate", True)) # propagate distributed tracing headers - # breakpoint() if cfg.get("distributed_tracing"): - # breakpoint() - print("propagating headers") - print(span.context) HTTPPropagator.inject(span.context, request.headers) response = response_headers = None diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py index 05f3b599664..d060f8173f7 100644 --- a/ddtrace/llmobs/_constants.py +++ b/ddtrace/llmobs/_constants.py @@ -16,6 +16,8 @@ INPUT_PARAMETERS = "_ml_obs.meta.input.parameters" INPUT_PROMPT = "_ml_obs.meta.input.prompt" EXPECTED_OUTPUT = "_ml_obs.meta.input.expected_output" +EXPERIMENT_INPUT = "_ml_obs.meta.input" +EXPERIMENT_OUTPUT = "_ml_obs.meta.output" OUTPUT_DOCUMENTS = "_ml_obs.meta.output.documents" OUTPUT_MESSAGES = "_ml_obs.meta.output.messages" OUTPUT_VALUE = "_ml_obs.meta.output.value" diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index d6d72284a6f..335c0ff4e30 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -229,7 +229,6 @@ def push(self, chunk_size: int = 300) -> None: # Only show progress bar for large datasets show_progress = total_records > chunk_size if show_progress: - print(f"\nUploading {total_records} records in {total_chunks} chunks...") _print_progress_bar(0, total_chunks, prefix='Uploading:', suffix='Complete') for i, chunk in enumerate(chunks): @@ -602,7 +601,7 @@ def _create_experiment_in_datadog(self) -> str: def run( self, - _jobs: int = 10, + jobs: int = 10, raise_errors: bool = False, ) -> "ExperimentResults": """ @@ -626,55 +625,24 @@ def run( self._datadog_experiment_id = experiment_id # 3) Now run the task and evaluations - self.run_task(_jobs=_jobs, raise_errors=raise_errors) + self.run_task(_jobs=jobs, raise_errors=raise_errors) experiment_results = self.run_evaluations(raise_errors=raise_errors) return experiment_results - def run_task( - self, - _jobs: int = 10, - raise_errors: bool = False, - ) -> None: + def run_task(self, _jobs: int = 50, raise_errors: bool = False) -> None: """ - Execute the task function on the dataset and store the outputs. - The caller (run()) ensures that self._datadog_experiment_id is set first. + Execute the task function on the dataset concurrently using ThreadPoolExecutor.map, + updating progress via _print_progress_bar and processing more rows in parallel. 
""" os.environ["DD_EXPERIMENTS_RUNNER_ENABLED"] = "True" - if not 1 <= _jobs <= 30: - raise ValueError("Number of jobs must be between 1 and 30") - - def instrumented_task( - record_id: str, input_data: Any, expected_output: Any, config: Optional[Dict[str, Any]] = None - ): - with LLMObs._experiment(name="experiment-task") as span: - span.context.set_baggage_item("is_experiment_task", True) - output = self.task(input_data, config) - LLMObs.annotate( - span, - input_data=input_data, - output_data=output, - tags={ - "dataset_id": self.dataset._datadog_dataset_id, - "dataset_record_id": record_id, - "experiment_id": self._datadog_experiment_id, - - }, - ) - LLMObs._tag_expected_output(span, expected_output) - return (output, span) - - self.outputs = [] total_rows = len(self.dataset) - completed = 0 - error_count = 0 def process_row(idx_row): idx, row = idx_row start_time = time.time() - - with LLMObs._experiment(name="experiment-task") as span: - span.context.set_baggage_item("is_experiment_task", True) - try: + try: + with LLMObs._experiment(name=self.task.__name__) as span: + span.context.set_baggage_item("is_experiment_task", True) input_data = row["input"] expected_output = row["expected_output"] @@ -684,7 +652,7 @@ def process_row(idx_row): output = self.task(input_data) # Periodic flush for concurrency - if idx % 10 == 0: + if idx % 30 == 0: LLMObs.flush() LLMObs.annotate( @@ -699,6 +667,10 @@ def process_row(idx_row): ) LLMObs._tag_expected_output(span, expected_output) + span_context = LLMObs.export_span(span=span) + span_id = span_context["span_id"] + trace_id = span_context["trace_id"] + return { "idx": idx, "output": output, @@ -709,92 +681,56 @@ def process_row(idx_row): "project_name": self.project_name, "experiment_name": self.name, "dataset_name": self.dataset.name, - "span_id": span.span_id, - "trace_id": span.trace_id, + "span_id": span_id, + "trace_id": trace_id, }, "error": {"message": None, "stack": None, "type": None}, } - - except Exception as e: - error_message = str(e) - return { - "idx": idx, - "output": None, - "metadata": { - "timestamp": start_time, - "duration": time.time() - start_time, - "dataset_record_index": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - "span_id": span.span_id, - "trace_id": span.trace_id, - }, - "error": { - "message": error_message, - "stack": traceback.format_exc(), - "type": type(e).__name__, - } + except Exception as e: + error_message = str(e) + # In case of an exception, span_id and trace_id are set to None + return { + "idx": idx, + "output": None, + "metadata": { + "timestamp": start_time, + "duration": time.time() - start_time, + "dataset_record_index": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + "span_id": None, + "trace_id": None, + }, + "error": { + "message": error_message, + "stack": traceback.format_exc(), + "type": type(e).__name__, } + } - _print_progress_bar(0, total_rows, prefix='Processing:', suffix='Complete') + outputs_buffer = [] + completed = 0 + # Using ThreadPoolExecutor.map to process rows concurrently with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: - futures = { - executor.submit(process_row, (idx, row)): idx - for idx, row in enumerate(self.dataset) - } - outputs_buffer = [None] * total_rows - - try: - for future in concurrent.futures.as_completed(futures): - idx = futures[future] - try: - output_data = future.result() - outputs_buffer[idx] = 
output_data - if raise_errors and output_data["error"]["message"]: - error_message = output_data["error"]["message"] - raise ExperimentTaskError( - error_message, idx, output_data["error"]["type"] - ) - - elif output_data["error"]["message"]: - error_count += 1 - - except Exception as e: - outputs_buffer[idx] = { - "idx": idx, - "output": None, - "metadata": { - "timestamp": time.time(), - "duration": 0, - "dataset_record_index": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - "span_id": span.span_id, - "trace_id": span.trace_id, - }, - "error": { - "message": str(e), - "stack": traceback.format_exc(), - "type": type(e).__name__, - }, - } - if raise_errors: - raise e - else: - error_count += 1 - - completed += 1 - _print_progress_bar(completed, total_rows, prefix="Processing:", suffix="Complete") - - except Exception as e: - for future in futures: - future.cancel() - executor.shutdown(wait=False) - raise e + # executor.map returns results in order, so we iterate and update our progress + for result in executor.map(process_row, list(enumerate(self.dataset))): + outputs_buffer.append(result) + completed += 1 + _print_progress_bar(completed, total_rows, prefix="Processing:", suffix="Complete") + # Check for errors and raise if required + error_count = 0 + for idx, output_data in enumerate(outputs_buffer): + if output_data["error"]["message"]: + error_count += 1 + if raise_errors: + raise ExperimentTaskError( + output_data["error"]["message"], + idx, + output_data["error"]["type"] + ) self.outputs = outputs_buffer self.has_run = True @@ -1110,6 +1046,7 @@ def push(self, chunk_size: int = 300) -> None: metric_value = str(metric_payload_value["value"]) + metric = { "span_id": str(span_id), "trace_id": str(trace_id), @@ -1118,28 +1055,25 @@ def push(self, chunk_size: int = 300) -> None: "label": metric_payload_name, "score_value" if metric_type == "score" else "categorical_value": metric_value, "error": metric_payload_value["error"], - "join_on": { - "span": { - "trace_id": str(trace_id), - "span_id": str(span_id), - }, - } + # "join_on": { + # "span": { + # "trace_id": str(trace_id), + # "span_id": str(span_id), + # }, + # } } metrics.append(metric) - # Prepare and send chunk payload chunk_payload = { "data": { - "type": "evaluation_metric", + "type": "experiments", "attributes": {"scope": "experiments", "metrics": metrics, "tags": self.experiment.tags + ["ddtrace.version:" + ddtrace.__version__, "experiment_id:" + experiment_id]}, } } - print("chunk_payload: ", chunk_payload) - - url = f"/api/intake/llm-obs/v2/eval-metric" + url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events" exp_http_request("POST", url, body=json.dumps(chunk_payload).encode("utf-8")) if show_progress: diff --git a/ddtrace/llmobs/_integrations/base.py b/ddtrace/llmobs/_integrations/base.py index a6968ce0d83..25081cd0f0a 100644 --- a/ddtrace/llmobs/_integrations/base.py +++ b/ddtrace/llmobs/_integrations/base.py @@ -210,7 +210,8 @@ def llmobs_set_tags( return try: self._llmobs_set_tags(span, args, kwargs, response, operation) - except Exception: + except Exception as e: + print(e) log.error("Error extracting LLMObs fields for span %s, likely due to malformed data", span, exc_info=True) @abc.abstractmethod diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index ef5d96ffa88..5dff1ffaa43 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -53,6 +53,8 @@ from ddtrace.llmobs._constants import 
SPAN_START_WHILE_DISABLED_WARNING from ddtrace.llmobs._constants import TAGS from ddtrace.llmobs._constants import EXPECTED_OUTPUT +from ddtrace.llmobs._constants import EXPERIMENT_INPUT +from ddtrace.llmobs._constants import EXPERIMENT_OUTPUT from ddtrace.llmobs._evaluators.runner import EvaluatorRunner from ddtrace.llmobs._utils import AnnotationContext from ddtrace.llmobs._utils import _get_llmobs_parent_id @@ -194,8 +196,13 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: span._set_ctx_item(ML_APP, ml_app) parent_id = str(_get_llmobs_parent_id(span) or "undefined") + # Experiments related if span._get_ctx_item(EXPECTED_OUTPUT) is not None: meta["expected_output"] = span._get_ctx_item(EXPECTED_OUTPUT) + if span._get_ctx_item(EXPERIMENT_INPUT) is not None: + meta["input"] = span._get_ctx_item(EXPERIMENT_INPUT) + if span._get_ctx_item(EXPERIMENT_OUTPUT) is not None: + meta["output"] = span._get_ctx_item(EXPERIMENT_OUTPUT) llmobs_span_event = { "trace_id": "{:x}".format(span.trace_id), @@ -805,6 +812,8 @@ def annotate( cls._tag_embedding_io(span, input_documents=input_data, output_text=output_data) elif span_kind == "retrieval": cls._tag_retrieval_io(span, input_text=input_data, output_documents=output_data) + elif span_kind == "experiment": + cls._tag_experiment_io(span, input_data=input_data, output_data=output_data) else: cls._tag_text_io(span, input_value=input_data, output_value=output_data) @@ -906,6 +915,16 @@ def _tag_text_io(cls, span, input_value=None, output_value=None): if output_value is not None: span._set_ctx_item(OUTPUT_VALUE, str(output_value)) + @classmethod + def _tag_experiment_io(cls, span, input_data=None, output_data=None): + """Tags input/output values for experiment kind spans. + Will be mapped to span's `meta.{input,output}.values` fields. + """ + if input_data is not None: + span._set_ctx_item(EXPERIMENT_INPUT, str(input_data)) + if output_data is not None: + span._set_ctx_item(EXPERIMENT_OUTPUT, str(output_data)) + @staticmethod def _tag_span_tags(span: Span, span_tags: Dict[str, Any]) -> None: """Tags a given LLMObs span with a dictionary of key-value tag pairs. 
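
For reference, a minimal standalone sketch of the mapping this _llmobs.py change introduces: values stored under the new EXPERIMENT_INPUT and EXPERIMENT_OUTPUT context items are surfaced as the span event's meta["input"] and meta["output"]. This is an illustration only, not ddtrace code; the plain dict below stands in for the span's context items.

    EXPERIMENT_INPUT = "_ml_obs.meta.input"
    EXPERIMENT_OUTPUT = "_ml_obs.meta.output"

    def build_experiment_meta(ctx_items: dict) -> dict:
        # Mirrors the branch added above: only keys that were annotated are set.
        meta = {}
        if ctx_items.get(EXPERIMENT_INPUT) is not None:
            meta["input"] = ctx_items[EXPERIMENT_INPUT]
        if ctx_items.get(EXPERIMENT_OUTPUT) is not None:
            meta["output"] = ctx_items[EXPERIMENT_OUTPUT]
        return meta

    assert build_experiment_meta(
        {EXPERIMENT_INPUT: {"question": "2 + 2?"}, EXPERIMENT_OUTPUT: "4"}
    ) == {"input": {"question": "2 + 2?"}, "output": "4"}
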
diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index 7f6be1d6fd4..39dad763389 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -190,101 +190,38 @@ def __len__(self): def _init_buffer(self): with self._lock: self._buffer = [] - self._experiment_buffer = [] self.buffer_size = 0 def put(self, events: List[LLMObsSpanEvent]): - # Split incoming events into normal vs experiment spans - norm_events = [] - exp_events = [] - for e in events: - if e.get("meta", {}).get("span.kind") == "experiment": - exp_events.append(e) - else: - norm_events.append(e) - - # Add normal spans to main buffer - if norm_events: - with self._lock: - if len(self._buffer) + len(norm_events) > self._buffer_limit: - logger.warning("Dropping normal spans: buffer limit reached") - return - self._buffer.extend(norm_events) - self.buffer_size += len(safe_json(norm_events)) - - # Add experiment spans to separate buffer - if exp_events: - with self._lock: - if len(self._experiment_buffer) + len(exp_events) > self._buffer_limit: - logger.warning("Dropping experiment spans: buffer limit reached") - return - self._experiment_buffer.extend(exp_events) - self.buffer_size += len(safe_json(exp_events)) + # events always has only 1 event - with List type to be compatible with HTTPWriter interfaces + with self._lock: + if len(self._buffer) >= self._buffer_limit: + logger.warning( + "%r event buffer full (limit is %d), dropping event", self.__class__.__name__, self._buffer_limit + ) + return + self._buffer.extend(events) + self.buffer_size += len(safe_json(events)) def encode(self): - """Encode only the normal spans for standard flush""" with self._lock: if not self._buffer: return None, 0 events = self._buffer - - # Save experiment buffer before _init_buffer() clears it - experiment_spans = self._experiment_buffer self._init_buffer() - - data = { - "_dd.stage": "raw", - "_dd.tracer_version": ddtrace.__version__, - "event_type": "span", - "spans": events - } - - if asbool(os.getenv("DD_EXPERIMENTS_RUNNER_ENABLED")): - data["_dd.scope"] = "experiments" - - try: - enc_data = safe_json(data) - if isinstance(enc_data, str): - enc_data = enc_data.encode('utf-8') - logger.debug("encode %d LLMObs span events", len(events)) - except TypeError: - logger.error("failed to encode LLMObs span events", exc_info=True) - return None, 0 - - # Restore experiment buffer - with self._lock: - self._experiment_buffer = experiment_spans - - return enc_data, len(events) - - def encode_experiment_spans(self): - """Encode only the experiment spans for separate request""" - with self._lock: - if not self._experiment_buffer: - return None, 0 - exp_events = self._experiment_buffer - self._experiment_buffer = [] - - data = { - "_dd.stage": "raw", - "_dd.tracer_version": ddtrace.__version__, - "event_type": "experiment-span", - "experiment_spans": exp_events - } - + data = {"_dd.stage": "raw", "_dd.tracer_version": ddtrace.__version__, "event_type": "span", "spans": events} if asbool(os.getenv("DD_EXPERIMENTS_RUNNER_ENABLED")): data["_dd.scope"] = "experiments" - try: - enc_data = safe_json(data) - if isinstance(enc_data, str): - enc_data = enc_data.encode('utf-8') - logger.debug("encode %d LLMObs experiment span events", len(exp_events)) + enc_llm_events = safe_json(data) + if isinstance(enc_llm_events, str): + enc_llm_events = enc_llm_events.encode('utf-8') + logger.debug("encode %d LLMObs span events to be sent", len(events)) + except TypeError: - logger.error("failed to encode LLMObs experiment span events", exc_info=True) 
+ logger.error("failed to encode %d LLMObs span events", len(events), exc_info=True) return None, 0 - - return enc_data, len(exp_events) + return enc_llm_events, len(events) class LLMObsEventClient(WriterClientBase): @@ -378,33 +315,6 @@ def recreate(self): is_agentless=config._llmobs_agentless_enabled, ) - def periodic(self) -> None: - # First flush normal spans using parent logic - super(LLMObsSpanWriter, self).periodic() - - # Then flush experiment spans in a separate request - for client in self._clients: - if isinstance(client, LLMObsEventClient) and isinstance(client.encoder, LLMObsSpanEncoder): - encoded, count = client.encoder.encode_experiment_spans() - if not encoded or not count: - continue - - try: - self._send_payload_with_backoff(encoded, count, client) - except Exception: - self._metrics_dist("http.errors", tags=["type:err"]) - self._metrics_dist("http.dropped.bytes", len(encoded)) - self._metrics_dist("http.dropped.traces", count) - logger.error( - "failed to send %d experiment spans to %s", - count, - self.intake_url, - exc_info=True - ) - else: - self._metrics_dist("http.sent.bytes", len(encoded)) - self._metrics_dist("http.sent.traces", count) - def _truncate_span_event(event: LLMObsSpanEvent) -> LLMObsSpanEvent: event["meta"]["input"] = {"value": DROPPED_VALUE_TEXT} From 9ecc7889853b5456788be1d807ff4cde88944f0b Mon Sep 17 00:00:00 2001 From: Jonathan Chavez Date: Tue, 11 Feb 2025 17:31:08 -0500 Subject: [PATCH 36/36] stabilize errors --- ddtrace/llmobs/_experiments.py | 164 +++++++++++++++++++-------------- ddtrace/llmobs/_llmobs.py | 5 +- ddtrace/llmobs/_writer.py | 1 + 3 files changed, 97 insertions(+), 73 deletions(-) diff --git a/ddtrace/llmobs/_experiments.py b/ddtrace/llmobs/_experiments.py index 335c0ff4e30..7b6d6618db3 100644 --- a/ddtrace/llmobs/_experiments.py +++ b/ddtrace/llmobs/_experiments.py @@ -23,7 +23,7 @@ import ddtrace from ddtrace import patch_all -patch_all() +# patch_all() # TODO: remove this comment if it messes with dist tracing, right now it's needed because it overrides integrations_enabled DD_SITE = os.getenv("DD_SITE", "datadoghq.com") if DD_SITE == "datadoghq.com": @@ -31,9 +31,6 @@ else: BASE_URL = f"https://{DD_SITE}" -class FileType(Enum): - CSV = 'csv' - LLMObs.enable( ml_app="experiment-jonathan", integrations_enabled=True, @@ -42,6 +39,43 @@ class FileType(Enum): api_key=os.getenv("DD_API_KEY"), ) +IS_INITIALIZED = False +ENV_ML_APP = None +ENV_PROJECT_NAME = None +ENV_SITE = None +ENV_API_KEY = None +ENV_APPLICATION_KEY = None + +def init(project_name: str, api_key: str = None, application_key: str = None, ml_app: str = "experiments", site: str = "datadoghq.com") -> None: + """Initialize an experiment environment. 
+ + Args: + project_name: Name of the project + api_key: Datadog API key + application_key: Datadog application key + ml_app: Name of the ML app + site: Datadog site + """ + + global IS_INITIALIZED + if IS_INITIALIZED: + raise ValueError("Experiment environment already initialized, please call init() only once") + else: + if api_key is None: + api_key = os.getenv("DD_API_KEY") + if api_key is None: + raise ValueError("DD_API_KEY environment variable is not set, please set it or pass it as an argument to init(api_key=...)") + if application_key is None: + application_key = os.getenv("DD_APPLICATION_KEY") + if application_key is None: + raise ValueError("DD_APPLICATION_KEY environment variable is not set, please set it or pass it as an argument to init(application_key=...)") + + ENV_ML_APP = ml_app + ENV_PROJECT_NAME = project_name + ENV_SITE = site + ENV_API_KEY = api_key + ENV_APPLICATION_KEY = application_key + IS_INITIALIZED = True class Dataset: @@ -331,38 +365,6 @@ def from_csv( return cls(name=name, data=data, description=description) - @classmethod - def load(cls, path: str, filetype: FileType, name: str, description: str = "", input_columns: List[str] = None, expected_output_columns: List[str] = None, metadata_columns: List[str] = None, delimiter: str = ",") -> "Dataset": - """Import a dataset from a file. - - Args: - path (str): Path to the input file - filetype (FileType): Type of file to import (CSV, JSONL, or PARQUET) - name (str): Name of the dataset - description (str, optional): Description of the dataset. Defaults to "". - input_columns (List[str], optional): List of column names to use as input data. Required for CSV and PARQUET files. - expected_output_columns (List[str], optional): List of column names to use as expected output data. Required for CSV and PARQUET files. - metadata_columns (List[str], optional): List of column names to include as metadata. Defaults to None. - delimiter (str, optional): Delimiter character for CSV files. Defaults to ",". - - Returns: - Dataset: A new Dataset instance containing the imported data - - Raises: - ValueError: If filetype is not supported or if required columns are missing - """ - if filetype == FileType.CSV: - return cls.from_csv( - filepath=path, - name=name, - description=description, - delimiter=delimiter, - input_columns=input_columns, - expected_output_columns=expected_output_columns, - metadata_columns=metadata_columns, - ) - - raise ValueError(f"Unsupported file type: {filetype}") def as_dataframe(self, multiindex: bool = True) -> "pd.DataFrame": """Convert the dataset to a pandas DataFrame. @@ -602,7 +604,7 @@ def _create_experiment_in_datadog(self) -> str: def run( self, jobs: int = 10, - raise_errors: bool = False, + raise_errors: bool = True, ) -> "ExperimentResults": """ Execute the task and evaluations, returning the results. @@ -629,7 +631,7 @@ def run( experiment_results = self.run_evaluations(raise_errors=raise_errors) return experiment_results - def run_task(self, _jobs: int = 50, raise_errors: bool = False) -> None: + def run_task(self, _jobs: int = 10, raise_errors: bool = True) -> None: """ Execute the task function on the dataset concurrently using ThreadPoolExecutor.map, updating progress via _print_progress_bar and processing more rows in parallel. 
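
As a point of reference for the hunks that follow, this is a minimal sketch of the ThreadPoolExecutor.map pattern the docstring above describes: map yields results in input order, so a running counter is enough to drive the progress display. The names here (process_row, rows) are illustrative stand-ins, not the SDK's API.

    import concurrent.futures
    import time

    def process_row(idx_row):
        idx, row = idx_row
        time.sleep(0.01)  # stand-in for the user-provided task
        return {"idx": idx, "output": row["input"]["question"].upper()}

    rows = [{"input": {"question": f"q{i}"}} for i in range(5)]
    outputs_buffer = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Results arrive in input order, so index alignment with the dataset is preserved.
        for completed, result in enumerate(executor.map(process_row, enumerate(rows)), start=1):
            outputs_buffer.append(result)
            print(f"Processing: {completed}/{len(rows)}")
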
@@ -640,21 +642,22 @@ def run_task(self, _jobs: int = 50, raise_errors: bool = False) -> None: def process_row(idx_row): idx, row = idx_row start_time = time.time() - try: - with LLMObs._experiment(name=self.task.__name__) as span: - span.context.set_baggage_item("is_experiment_task", True) - input_data = row["input"] - expected_output = row["expected_output"] + with LLMObs._experiment(name=self.task.__name__) as span: + span.context.set_baggage_item("is_experiment_task", True) + span_context = LLMObs.export_span(span=span) + span_id = span_context["span_id"] + trace_id = span_context["trace_id"] + input_data = row["input"] + expected_output = row["expected_output"] + + try: + if getattr(self.task, "_accepts_config", False): output = self.task(input_data, self.config) else: output = self.task(input_data) - # Periodic flush for concurrency - if idx % 30 == 0: - LLMObs.flush() - LLMObs.annotate( span, input_data=input_data, @@ -667,9 +670,9 @@ def process_row(idx_row): ) LLMObs._tag_expected_output(span, expected_output) - span_context = LLMObs.export_span(span=span) - span_id = span_context["span_id"] - trace_id = span_context["trace_id"] + # Periodic flush for concurrency + if idx % 30 == 0: + LLMObs.flush() return { "idx": idx, @@ -686,31 +689,50 @@ def process_row(idx_row): }, "error": {"message": None, "stack": None, "type": None}, } - except Exception as e: - error_message = str(e) - # In case of an exception, span_id and trace_id are set to None - return { - "idx": idx, - "output": None, - "metadata": { - "timestamp": start_time, - "duration": time.time() - start_time, - "dataset_record_index": idx, - "project_name": self.project_name, - "experiment_name": self.name, - "dataset_name": self.dataset.name, - "span_id": None, - "trace_id": None, - }, - "error": { - "message": error_message, - "stack": traceback.format_exc(), - "type": type(e).__name__, + except Exception as e: + error_message = str(e) + span.error = 1 + span.set_exc_info(type(e), e, e.__traceback__) + + LLMObs.annotate( + span, + input_data=input_data, + tags={ + "dataset_id": self.dataset._datadog_dataset_id, + "dataset_record_id": row["record_id"], + "experiment_id": self._datadog_experiment_id, + }, + ) + LLMObs._tag_expected_output(span, expected_output) + + # Periodic flush for concurrency + if idx % 30 == 0: + LLMObs.flush() + + return { + "idx": idx, + "output": None, + "metadata": { + "timestamp": start_time, + "duration": time.time() - start_time, + "dataset_record_index": idx, + "project_name": self.project_name, + "experiment_name": self.name, + "dataset_name": self.dataset.name, + "span_id": span_id, + "trace_id": trace_id, + }, + "error": { + "message": error_message, + "stack": traceback.format_exc(), + "type": type(e).__name__, + } } - } outputs_buffer = [] completed = 0 + error_count = 0 + # Using ThreadPoolExecutor.map to process rows concurrently with concurrent.futures.ThreadPoolExecutor(max_workers=_jobs) as executor: diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 5dff1ffaa43..d8ccefe984a 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -223,6 +223,7 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: llmobs_span_event["tags"] = cls._llmobs_tags( span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span ) + return llmobs_span_event, is_ragas_integration_span @staticmethod @@ -921,9 +922,9 @@ def _tag_experiment_io(cls, span, input_data=None, output_data=None): Will be mapped to span's 
`meta.{input,output}.values` fields. """ if input_data is not None: - span._set_ctx_item(EXPERIMENT_INPUT, str(input_data)) + span._set_ctx_item(EXPERIMENT_INPUT, input_data) if output_data is not None: - span._set_ctx_item(EXPERIMENT_OUTPUT, str(output_data)) + span._set_ctx_item(EXPERIMENT_OUTPUT, output_data) @staticmethod def _tag_span_tags(span: Span, span_tags: Dict[str, Any]) -> None: diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index 39dad763389..8bbf15e5bec 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -221,6 +221,7 @@ def encode(self): except TypeError: logger.error("failed to encode %d LLMObs span events", len(events), exc_info=True) return None, 0 + # print(enc_llm_events) return enc_llm_events, len(events)
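
For illustration, a minimal sketch of the chunk payload that the final version of push() assembles, built only from field names visible in the diffs above. The concrete identifiers and values are placeholders; in push() they come from the merged result metadata and the already-created experiment, and the payload is posted with the document's exp_http_request helper (left commented so the sketch runs standalone).

    import json
    import time

    # Placeholder identifiers for the sketch.
    span_id, trace_id, experiment_id = "1234567890", "9876543210", "exp-id-placeholder"
    metric_type = "score"

    metric = {
        "span_id": str(span_id),
        "trace_id": str(trace_id),
        "metric_type": metric_type,
        "timestamp_ms": int(time.time() * 1000),
        "label": "accuracy",
        # push() stringifies the value and picks the key based on the metric type.
        "score_value" if metric_type == "score" else "categorical_value": "0.87",
        "error": None,
    }

    chunk_payload = {
        "data": {
            "type": "experiments",
            "attributes": {
                "scope": "experiments",
                "metrics": [metric],
                "tags": ["ddtrace.version:placeholder", "experiment_id:" + experiment_id],
            },
        }
    }

    url = f"/api/unstable/llm-obs/v1/experiments/{experiment_id}/events"
    body = json.dumps(chunk_payload).encode("utf-8")
    # exp_http_request("POST", url, body=body)
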