From 4fc140e971155a0d8db35e01a37d8987d17092a4 Mon Sep 17 00:00:00 2001 From: nikki everett Date: Wed, 5 Feb 2025 21:34:14 -0700 Subject: [PATCH 1/9] convert to codeexample in dagster+ docs Signed-off-by: nikki everett --- .../ci-cd/branch-deployments/testing.md | 329 +----------------- 1 file changed, 9 insertions(+), 320 deletions(-) diff --git a/docs/docs-beta/docs/dagster-plus/features/ci-cd/branch-deployments/testing.md b/docs/docs-beta/docs/dagster-plus/features/ci-cd/branch-deployments/testing.md index 2f243201e3a83..58993c24a51bd 100644 --- a/docs/docs-beta/docs/dagster-plus/features/ci-cd/branch-deployments/testing.md +++ b/docs/docs-beta/docs/dagster-plus/features/ci-cd/branch-deployments/testing.md @@ -58,54 +58,7 @@ To set up a branch deployment workflow to construct and test these tables, we wi In production, we want to write three tables to Snowflake: `ITEMS`, `COMMENTS`, and `STORIES`. We can define these tables as assets as follows: -{/* TODO convert to */} -```python file=/guides/dagster/development_to_production/assets.py startafter=start_assets endbefore=end_assets -# assets.py -import pandas as pd -import requests - -from dagster import Config, asset - - -class ItemsConfig(Config): - base_item_id: int - - -@asset( - io_manager_key="snowflake_io_manager", -) -def items(config: ItemsConfig) -> pd.DataFrame: - """Items from the Hacker News API: each is a story or a comment on a story.""" - rows = [] - max_id = requests.get( - "https://hacker-news.firebaseio.com/v0/maxitem.json", timeout=5 - ).json() - # Hacker News API is 1-indexed, so adjust range by 1 - for item_id in range(max_id - config.base_item_id + 1, max_id + 1): - item_url = f"https://hacker-news.firebaseio.com/v0/item/{item_id}.json" - rows.append(requests.get(item_url, timeout=5).json()) - - # ITEM_FIELD_NAMES is a list of the column names in the Hacker News dataset - result = pd.DataFrame(rows, columns=ITEM_FIELD_NAMES).drop_duplicates(subset=["id"]) - result.rename(columns={"by": "user_id"}, inplace=True) - return result - - -@asset( - io_manager_key="snowflake_io_manager", -) -def comments(items: pd.DataFrame) -> pd.DataFrame: - """Comments from the Hacker News API.""" - return items[items["type"] == "comment"] - - -@asset( - io_manager_key="snowflake_io_manager", -) -def stories(items: pd.DataFrame) -> pd.DataFrame: - """Stories from the Hacker News API.""" - return items[items["type"] == "story"] -``` + As you can see, our assets use an [I/O manager](/guides/build/io-managers/) named `snowflake_io_manager`. Using I/O managers and other resources allow us to swap out implementations per environment without modifying our business logic. 
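Because the asset bodies never talk to Snowflake directly, the business logic can also be exercised without any Snowflake credentials at all. Below is a minimal sketch of a unit test that calls the filtering assets as plain functions; the `assets` import path and the sample rows are illustrative assumptions, not part of the project above:

```python
# test_assets.py -- illustrative sketch; assumes the asset definitions above live in assets.py
import pandas as pd

from assets import comments, stories  # hypothetical import path


def test_comments_and_stories_split_items_by_type():
    items_df = pd.DataFrame(
        [
            {"id": 1, "type": "comment", "user_id": "alice"},
            {"id": 2, "type": "story", "user_id": "bob"},
        ]
    )

    # @asset-decorated functions can be invoked directly, so no I/O manager
    # (and therefore no Snowflake connection) is needed here.
    assert comments(items_df)["id"].tolist() == [1]
    assert stories(items_df)["id"].tolist() == [2]
```

The same separation is what lets the branch deployment workflow below point these assets at a clone of the production database without touching their definitions.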
@@ -119,46 +72,7 @@ Dagster automatically sets certain [environment variables](/dagster-plus/deploym Because we want to configure our assets to write to Snowflake using a different set of credentials and database in each environment, we'll configure a separate I/O manager for each environment: -{/* TODO convert to */} -```python file=/guides/dagster/development_to_production/branch_deployments/repository_v1.py startafter=start_repository endbefore=end_repository -# definitions.py -from dagster import Definitions - -from ..assets import comments, items, stories - -snowflake_config = { - "account": "abc1234.us-east-1", - "user": "system@company.com", - "password": {"env": "SYSTEM_SNOWFLAKE_PASSWORD"}, - "schema": "HACKER_NEWS", -} - -resources = { - "branch": { - "snowflake_io_manager": SnowflakePandasIOManager( - **snowflake_config, - database=f"PRODUCTION_CLONE_{os.getenv('DAGSTER_CLOUD_PULL_REQUEST_ID')}", - ), - }, - "prod": { - "snowflake_io_manager": SnowflakePandasIOManager( - **snowflake_config, - database="PRODUCTION", - ), - }, -} - - -def get_current_env(): - is_branch_depl = os.getenv("DAGSTER_CLOUD_IS_BRANCH_DEPLOYMENT") == "1" - assert is_branch_depl is not None # env var must be set - return "branch" if is_branch_depl else "prod" - - -defs = Definitions( - assets=[items, comments, stories], resources=resources[get_current_env()] -) -``` + Refer to the [Dagster+ environment variables documentation](/dagster-plus/deployment/management/environment-variables/) for more info about available environment variables. @@ -177,89 +91,17 @@ these tasks, like viewing them in the Global Asset Graph. ::: -{/* TODO convert to */} -```python file=/guides/dagster/development_to_production/branch_deployments/clone_and_drop_db.py startafter=start_clone_db endbefore=end_clone_db -from dagster_snowflake import SnowflakeResource - -from dagster import In, Nothing, graph, op - - -@op -def drop_database_clone(snowflake: SnowflakeResource): - with snowflake.get_connection() as conn: - cur = conn.cursor() - cur.execute( - "DROP DATABASE IF EXISTS" - f" PRODUCTION_CLONE_{os.environ['DAGSTER_CLOUD_PULL_REQUEST_ID']}" - ) - - -@op(ins={"start": In(Nothing)}) -def clone_production_database(snowflake: SnowflakeResource): - with snowflake.get_connection() as conn: - cur = conn.cursor() - cur.execute( - "CREATE DATABASE" - f" PRODUCTION_CLONE_{os.environ['DAGSTER_CLOUD_PULL_REQUEST_ID']} CLONE" - ' "PRODUCTION"' - ) - - -@graph -def clone_prod(): - clone_production_database(start=drop_database_clone()) - - -@graph -def drop_prod_clone(): - drop_database_clone() -``` + We've defined `drop_database_clone` and `clone_production_database` to utilize the . The Snowflake resource will use the same configuration as the Snowflake I/O manager to generate a connection to Snowflake. However, while our I/O manager writes outputs to Snowflake, the Snowflake resource executes queries against Snowflake. We now need to define resources that configure our jobs to the current environment. 
We can modify the resource mapping by environment as follows: -{/* TODO convert to */} -```python file=/guides/dagster/development_to_production/branch_deployments/repository_v2.py startafter=start_resources endbefore=end_resources -resources = { - "branch": { - "snowflake_io_manager": SnowflakePandasIOManager( - **snowflake_config, - database=f"PRODUCTION_CLONE_{os.getenv('DAGSTER_CLOUD_PULL_REQUEST_ID')}", - ), - "snowflake": SnowflakeResource( - **snowflake_config, - database=f"PRODUCTION_CLONE_{os.getenv('DAGSTER_CLOUD_PULL_REQUEST_ID')}", - ), - }, - "prod": { - "snowflake_io_manager": SnowflakePandasIOManager( - **snowflake_config, - database="PRODUCTION", - ), - "snowflake": SnowflakeResource(**snowflake_config, database="PRODUCTION"), - }, -} -``` + Then, we can add the `clone_prod` and `drop_prod_clone` jobs that now use the appropriate resource to the environment and add them to our definitions: -{/* TODO convert to */} -```python file=/guides/dagster/development_to_production/branch_deployments/repository_v2.py startafter=start_repository endbefore=end_repository -branch_deployment_jobs = [ - clone_prod.to_job(), - drop_prod_clone.to_job(), -] -defs = Definitions( - assets=[items, comments, stories], - resources=resources[get_current_env()], - jobs=( - branch_deployment_jobs - if os.getenv("DAGSTER_CLOUD_IS_BRANCH_DEPLOYMENT") == "1" - else [] - ), -) -``` + ## Step 4: Create our database clone upon opening a branch @@ -268,37 +110,7 @@ defs = Definitions( The `branch_deployments.yml` file located in `.github/workflows/branch_deployments.yml` defines a `dagster_cloud_build_push` job with a series of steps that launch a branch deployment. Because we want to queue a run of `clone_prod` within each deployment after it launches, we'll add an additional step at the end `dagster_cloud_build_push`. This job is triggered on multiple pull request events: `opened`, `synchronize`, `reopen`, and `closed`. This means that upon future pushes to the branch, we'll trigger a run of `clone_prod`. The `if` condition below ensures that `clone_prod` will not run if the pull request is closed: -{/* TODO convert to */} -```yaml file=/guides/dagster/development_to_production/branch_deployments/clone_prod.yaml -# .github/workflows/branch_deployments.yml - -name: Dagster Branch Deployments - on: - pull_request: - types: [opened, synchronize, reopened, closed] - env: - DAGSTER_CLOUD_URL: ${{ secrets.DAGSTER_CLOUD_URL }} - - jobs: - dagster_cloud_build_push: - runs-on: ubuntu-latest - name: Dagster Branch Deployments - strategy: - ... - steps: - # Existing steps here - ... - - name: Clone Snowflake schema upon launch - if: github.event.action != 'closed' - uses: dagster-io/dagster-cloud-action/actions/utils/run@v0.1 - with: - location_name: ${{ matrix.location.name }} - deployment: ${{ steps.deploy.outputs.deployment }} - job_name: clone_prod - env: - DAGSTER_CLOUD_URL: ${{ secrets.DAGSTER_CLOUD_URL }} - DAGSTER_CLOUD_API_TOKEN: ${{ secrets.DAGSTER_CLOUD_API_TOKEN }} -``` + Opening a pull request for our current branch will automatically kick off a branch deployment. After the deployment launches, we can confirm that the `clone_prod` job has run: @@ -315,53 +127,7 @@ We can also view our database in Snowflake to confirm that a clone exists for ea The `.gitlab-ci.yaml` script contains a `deploy` job that defines a series of steps that launch a branch deployment. Because we want to queue a run of `clone_prod` within each deployment after it launches, we'll add an additional step at the end of `deploy`. 
This job is triggered on when a merge request is created or updated. This means that upon future pushes to the branch, we'll trigger a run of `clone_prod`. -```yaml file=/guides/dagster/development_to_production/branch_deployments/clone_prod.gitlab-ci.yml -# .gitlab-ci.yml - -stages: - - setup - - build - - deploy - -workflow: - rules: - - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH - - if: $CI_PIPELINE_SOURCE == 'merge_request_event' - -parse-workspace: - ... - -build-image: - ... - -deploy-docker: - ... - -deploy-docker-branch: - stage: deploy - rules: - - if: $CI_PIPELINE_SOURCE == 'merge_request_event' - dependencies: - - build-image - - parse-workspace - image: ghcr.io/dagster-io/dagster-cloud-action:0.1.23 - script: - # Existing steps here - ... - - # Add a step to launch the job cloning the prod db - - dagster-plus job launch - --url "$DAGSTER_CLOUD_URL/$DEPLOYMENT_NAME" - --api-token "$DAGSTER_CLOUD_API_TOKEN" - --location "location_name_containing_clone_prod_job" - --job clone_prod - environment: - name: branch/$CI_COMMIT_REF_NAME - on_stop: close_branch - -close_branch: - ... -``` + Opening a merge request for our current branch will automatically kick off a branch deployment. After the deployment launches, we can confirm that the `clone_prod` job has run: @@ -382,91 +148,14 @@ We can also view our database in Snowflake to confirm that a clone exists for ea Finally, we can add a step to our `branch_deployments.yml` file that queues a run of our `drop_prod_clone` job: -{/* TODO convert to */} -```yaml file=/guides/dagster/development_to_production/branch_deployments/drop_db_clone.yaml -# .github/workflows/branch_deployments.yml - -name: Dagster Branch Deployments - on: - pull_request: - types: [opened, synchronize, reopened, closed] - env: - DAGSTER_CLOUD_URL: ${{ secrets.DAGSTER_CLOUD_URL }} - - jobs: - dagster_cloud_build_push: - runs-on: ubuntu-latest - name: Dagster Branch Deployments - strategy: - ... - steps: - # Existing steps here - ... - - name: Clone Snowflake schema upon launch - ... - - name: Delete schema clone upon PR close - if: github.event.action == 'closed' - uses: dagster-io/dagster-cloud-action/actions/utils/run@v0.1 - with: - location_name: ${{ matrix.location.name }} - deployment: ${{ steps.deploy.outputs.deployment }} - job_name: drop_prod_clone - env: - DAGSTER_CLOUD_URL: ${{ secrets.DAGSTER_CLOUD_URL }} - DAGSTER_CLOUD_API_TOKEN: ${{ secrets.DAGSTER_CLOUD_API_TOKEN }} -``` + Finally, we can add a step to our `.gitlab-ci.yml` file that queues a run of our `drop_prod_clone` job: -{/* TODO convert to */} -```yaml file=/guides/dagster/development_to_production/branch_deployments/drop_db_clone.gitlab-ci.yml -# .gitlab-ci.yml - -stages: - - setup - - build - - deploy - -workflow: - rules: - - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH - - if: $CI_PIPELINE_SOURCE == 'merge_request_event' - -parse-workspace: - ... - -build-image: - ... - -deploy-docker: - ... - -deploy-docker-branch: - ... - -close_branch: - stage: deploy - image: ghcr.io/dagster-io/dagster-cloud-action:0.1.23 - when: manual - only: - - merge_requests - script: - # Existing steps here - ... 
- - # Add a step to launch the job dropping the cloned db - - dagster-plus job launch - --url "$DAGSTER_CLOUD_URL/$DEPLOYMENT_NAME" - --api-token "$DAGSTER_CLOUD_API_TOKEN" - --location "location_name_containing_drop_prod_clone_job" - --job drop_prod_clone - environment: - name: branch/$CI_COMMIT_REF_NAME - action: stop -``` + From d2b0bfdef0c47e80de7648aab9b2f4cdc1b71f1c Mon Sep 17 00:00:00 2001 From: nikki everett Date: Wed, 5 Feb 2025 23:04:30 -0700 Subject: [PATCH 2/9] update integrations docs code examples Signed-off-by: nikki everett --- .../libraries/dlt/using-dlt-with-dagster.md | 115 +--- .../libraries/duckdb/reference.md | 395 +------------- .../duckdb/using-duckdb-with-dagster.md | 208 +------- .../libraries/gcp/bigquery/reference.md | 419 +-------------- .../bigquery/using-bigquery-with-dagster.md | 228 +------- .../libraries/jupyter/reference.md | 92 +--- .../looker/using-looker-with-dagster.md | 119 +---- .../openai/using-openai-with-dagster.md | 67 +-- .../pandas/using-pandas-with-dagster.md | 92 +--- .../pandera/using-pandera-with-dagster.md | 42 +- .../powerbi/using-powerbi-with-dagster.md | 206 +------ .../sigma/using-sigma-with-dagster.md | 116 +--- .../libraries/snowflake/reference.md | 502 +----------------- ...sing-snowflake-with-dagster-io-managers.md | 112 +--- .../snowflake/using-snowflake-with-dagster.md | 97 +--- .../docs/integrations/libraries/tableau.md | 299 +---------- 16 files changed, 133 insertions(+), 2976 deletions(-) diff --git a/docs/docs-beta/docs/integrations/libraries/dlt/using-dlt-with-dagster.md b/docs/docs-beta/docs/integrations/libraries/dlt/using-dlt-with-dagster.md index e52e1aff43885..34248a632c9f9 100644 --- a/docs/docs-beta/docs/integrations/libraries/dlt/using-dlt-with-dagster.md +++ b/docs/docs-beta/docs/integrations/libraries/dlt/using-dlt-with-dagster.md @@ -228,49 +228,7 @@ The method. -{/* TODO convert to */} -```python file=/integrations/dlt/dlt_dagster_translator.py -import dlt -from dagster_dlt import DagsterDltResource, DagsterDltTranslator, dlt_assets -from dagster_dlt.translator import DltResourceTranslatorData - -from dagster import AssetExecutionContext, AssetKey, AssetSpec - - -@dlt.source -def example_dlt_source(): - def example_resource(): ... - - return example_resource - - -class CustomDagsterDltTranslator(DagsterDltTranslator): - def get_asset_spec(self, data: DltResourceTranslatorData) -> AssetSpec: - """Overrides asset spec to: - - Override asset key to be the dlt resource name, - - Override upstream asset key to be a single source asset. - """ - default_spec = super().get_asset_spec(data) - return default_spec.replace_attributes( - key=AssetKey(f"{data.resource.name}"), - deps=[AssetKey("common_upstream_dlt_dependency")], - ) - - -@dlt_assets( - name="example_dlt_assets", - dlt_source=example_dlt_source(), - dlt_pipeline=dlt.pipeline( - pipeline_name="example_pipeline_name", - dataset_name="example_dataset_name", - destination="snowflake", - progress="log", - ), - dagster_dlt_translator=CustomDagsterDltTranslator(), -) -def dlt_example_assets(context: AssetExecutionContext, dlt: DagsterDltResource): - yield from dlt.run(context=context) -``` + In this example, we customized the translator to change how the dlt assets' names are defined. We also hard-coded the asset dependency upstream of our assets to provide a fan-out model from a single dependency to our dlt assets. @@ -282,38 +240,7 @@ This can be accomplished by defining a with attributes like `group_name`. 
-{/* TODO convert to */} -```python file=/integrations/dlt/dlt_source_assets.py -import dlt -from dagster_dlt import DagsterDltResource, dlt_assets - -from dagster import AssetExecutionContext, AssetSpec - - -@dlt.source -def example_dlt_source(): - def example_resource(): ... - - return example_resource - - -@dlt_assets( - dlt_source=example_dlt_source(), - dlt_pipeline=dlt.pipeline( - pipeline_name="example_pipeline_name", - dataset_name="example_dataset_name", - destination="snowflake", - progress="log", - ), -) -def example_dlt_assets(context: AssetExecutionContext, dlt: DagsterDltResource): - yield from dlt.run(context=context) - - -thinkific_source_assets = [ - AssetSpec(key, group_name="thinkific") for key in example_dlt_assets.dependency_keys -] -``` + ### Using partitions in your dlt assets @@ -321,43 +248,7 @@ While still an experimental feature, it is possible to use partitions within you That said, here is an example of using static named partitions from a dlt source. -{/* TODO convert to */} -```python file=/integrations/dlt/dlt_partitions.py -from typing import Optional - -import dlt -from dagster_dlt import DagsterDltResource, dlt_assets - -from dagster import AssetExecutionContext, StaticPartitionsDefinition - -color_partitions = StaticPartitionsDefinition(["red", "green", "blue"]) - - -@dlt.source -def example_dlt_source(color: Optional[str] = None): - def load_colors(): - if color: - # partition-specific processing - ... - else: - # non-partitioned processing - ... - - -@dlt_assets( - dlt_source=example_dlt_source(), - name="example_dlt_assets", - dlt_pipeline=dlt.pipeline( - pipeline_name="example_pipeline_name", - dataset_name="example_dataset_name", - destination="snowflake", - ), - partitions_def=color_partitions, -) -def compute(context: AssetExecutionContext, dlt: DagsterDltResource): - color = context.partition_key - yield from dlt.run(context=context, dlt_source=example_dlt_source(color=color)) -``` + ## What's next? diff --git a/docs/docs-beta/docs/integrations/libraries/duckdb/reference.md b/docs/docs-beta/docs/integrations/libraries/duckdb/reference.md index 127de67758b18..1b7c73e2d8b57 100644 --- a/docs/docs-beta/docs/integrations/libraries/duckdb/reference.md +++ b/docs/docs-beta/docs/integrations/libraries/duckdb/reference.md @@ -27,24 +27,7 @@ For further information on the DuckDB resource, see the [DuckDB resource API doc ### Executing custom SQL queries -{/* TODO convert to */} -```python file=/integrations/duckdb/reference/resource.py startafter=start endbefore=end -from dagster_duckdb import DuckDBResource - -from dagster import asset - -# this example executes a query against the iris_dataset table created in Step 2 of the -# Using Dagster with DuckDB tutorial - - -@asset(deps=[iris_dataset]) -def small_petals(duckdb: DuckDBResource) -> None: - with duckdb.get_connection() as conn: # conn is a DuckDBPyConnection - conn.execute( - "CREATE TABLE iris.small_petals AS SELECT * FROM iris.iris_dataset WHERE" - " 'petal_length_cm' < 1 AND 'petal_width_cm' < 1" - ) -``` + In this example, we attach the DuckDB resource to the `small_petals` asset. In the body of the asset function, we use the `get_connection` context manager on the resource to get a [`duckdb.DuckDBPyConnection`](https://duckdb.org/docs/api/python/reference/#duckdb.DuckDBPyConnection). 
We can use this connection to execute a custom SQL query against the `iris_dataset` table created in [Step 2: Create tables in DuckDB](using-duckdb-with-dagster#option-1-step-2) of the [Using Dagster with DuckDB tutorial](using-duckdb-with-dagster). When the `duckdb.get_connection` context is exited, the DuckDB connection will be closed. @@ -56,29 +39,7 @@ The DuckDB I/O manager provides several ways to customize how your data is store Sometimes you may not want to fetch an entire table as the input to a downstream asset. With the DuckDB I/O manager, you can select specific columns to load by supplying metadata on the downstream asset. -{/* TODO convert to */} -```python file=/integrations/duckdb/reference/downstream_columns.py -import pandas as pd - -from dagster import AssetIn, asset - -# this example uses the iris_dataset asset from Step 2 of the Using Dagster with DuckDB tutorial - - -@asset( - ins={ - "iris_sepal": AssetIn( - key="iris_dataset", - metadata={"columns": ["sepal_length_cm", "sepal_width_cm"]}, - ) - } -) -def sepal_data(iris_sepal: pd.DataFrame) -> pd.DataFrame: - iris_sepal["sepal_area_cm2"] = ( - iris_sepal["sepal_length_cm"] * iris_sepal["sepal_width_cm"] - ) - return iris_sepal -``` + In this example, we only use the columns containing sepal data from the `IRIS_DATASET` table created in [Step 2: Create tables in DuckDB](using-duckdb-with-dagster#option-2-step-2) of the [Using Dagster with DuckDB tutorial](using-duckdb-with-dagster). To select specific columns, we can add metadata to the input asset. We do this in the `metadata` parameter of the `AssetIn` that loads the `iris_dataset` asset in the `ins` parameter. We supply the key `columns` with a list of names of the columns we want to fetch. @@ -95,40 +56,7 @@ In the following sections, we describe how the I/O manager constructs these quer To store static partitioned assets in DuckDB, specify `partition_expr` metadata on the asset to tell the DuckDB I/O manager which column contains the partition data: -{/* TODO convert to */} -```python file=/integrations/duckdb/reference/static_partition.py startafter=start_example endbefore=end_example -import pandas as pd - -from dagster import AssetExecutionContext, StaticPartitionsDefinition, asset - - -@asset( - partitions_def=StaticPartitionsDefinition( - ["Iris-setosa", "Iris-virginica", "Iris-versicolor"] - ), - metadata={"partition_expr": "SPECIES"}, -) -def iris_dataset_partitioned(context: AssetExecutionContext) -> pd.DataFrame: - species = context.partition_key - - full_df = pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) - - return full_df[full_df["Species"] == species] - - -@asset -def iris_cleaned(iris_dataset_partitioned: pd.DataFrame): - return iris_dataset_partitioned.dropna().drop_duplicates() -``` + Dagster uses the `partition_expr` metadata to craft the `SELECT` statement when loading the partition in the downstream asset. 
When loading a static partition (or multiple static partitions), the following statement is used: @@ -151,30 +79,7 @@ SELECT * Like static partitioned assets, you can specify `partition_expr` metadata on the asset to tell the DuckDB I/O manager which column contains the partition data: -{/* TODO convert to */} -```python file=/integrations/duckdb/reference/time_partition.py startafter=start_example endbefore=end_example -import pandas as pd - -from dagster import AssetExecutionContext, DailyPartitionsDefinition, asset - - -@asset( - partitions_def=DailyPartitionsDefinition(start_date="2023-01-01"), - metadata={"partition_expr": "TO_TIMESTAMP(TIME)"}, -) -def iris_data_per_day(context: AssetExecutionContext) -> pd.DataFrame: - partition = context.partition_key - - # get_iris_data_for_date fetches all of the iris data for a given date, - # the returned dataframe contains a column named 'time' with that stores - # the time of the row as an integer of seconds since epoch - return get_iris_data_for_date(partition) - - -@asset -def iris_cleaned(iris_data_per_day: pd.DataFrame): - return iris_data_per_day.dropna().drop_duplicates() -``` + Dagster uses the `partition_expr` metadata to craft the `SELECT` statement when loading the correct partition in the downstream asset. When loading a dynamic partition, the following statement is used: @@ -201,47 +106,7 @@ In this example, the data in the `TIME` column are integers, so the `partition_e The DuckDB I/O manager can also store data partitioned on multiple dimensions. To do this, specify the column for each partition as a dictionary of `partition_expr` metadata: -{/* TODO convert to */} -```python file=/integrations/duckdb/reference/multi_partition.py startafter=start_example endbefore=end_example -import pandas as pd - -from dagster import ( - AssetExecutionContext, - DailyPartitionsDefinition, - MultiPartitionsDefinition, - StaticPartitionsDefinition, - asset, -) - - -@asset( - partitions_def=MultiPartitionsDefinition( - { - "date": DailyPartitionsDefinition(start_date="2023-01-01"), - "species": StaticPartitionsDefinition( - ["Iris-setosa", "Iris-virginica", "Iris-versicolor"] - ), - } - ), - metadata={"partition_expr": {"date": "TO_TIMESTAMP(TIME)", "species": "SPECIES"}}, -) -def iris_dataset_partitioned(context: AssetExecutionContext) -> pd.DataFrame: - partition = context.partition_key.keys_by_dimension - species = partition["species"] - date = partition["date"] - - # get_iris_data_for_date fetches all of the iris data for a given date, - # the returned dataframe contains a column named 'time' with that stores - # the time of the row as an integer of seconds since epoch - full_df = get_iris_data_for_date(date) - - return full_df[full_df["species"] == species] - - -@asset -def iris_cleaned(iris_dataset_partitioned: pd.DataFrame): - return iris_dataset_partitioned.dropna().drop_duplicates() -``` + Dagster uses the `partition_expr` metadata to craft the `SELECT` statement when loading the correct partition in a downstream asset. For multi-partitions, Dagster concatenates the `WHERE` statements described in the above sections to craft the correct `SELECT` statement. 
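To make the concatenation concrete, here is a small illustrative sketch (plain string-building, not Dagster API) of the kind of clause that results when a downstream asset loads a single multi-partition, using the `partition_expr` mapping from the example above; the table name and partition values are assumptions for illustration only:

```python
# Approximates the statement Dagster issues when loading the
# (date=2023-01-02, species=Iris-setosa) partition of the asset above.
partition_expr = {"date": "TO_TIMESTAMP(TIME)", "species": "SPECIES"}

date_clause = (
    f"{partition_expr['date']} >= '2023-01-02 00:00:00' "
    f"AND {partition_expr['date']} < '2023-01-03 00:00:00'"
)
species_clause = f"{partition_expr['species']} in ('Iris-setosa')"

# The per-dimension WHERE fragments are joined together into one statement.
query = (
    "SELECT * FROM iris_dataset_partitioned "
    f"WHERE {date_clause} AND {species_clause}"
)
print(query)
```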
@@ -267,49 +132,16 @@ You can specify the default schema where data will be stored as configuration to If you want to store assets in different schemas, you can specify the schema as metadata: -{/* TODO convert to */} -```python file=/integrations/duckdb/reference/schema.py startafter=start_metadata endbefore=end_metadata dedent=4 -daffodil_dataset = AssetSpec( - key=["daffodil_dataset"], metadata={"schema": "daffodil"} -) - -@asset(metadata={"schema": "iris"}) -def iris_dataset() -> pd.DataFrame: - return pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) -``` +{/* TODO add dedent to this CodeExample */} + You can also specify the schema as part of the asset's key: -{/* TODO convert to */} -```python file=/integrations/duckdb/reference/schema.py startafter=start_asset_key endbefore=end_asset_key dedent=4 -daffodil_dataset = AssetSpec(key=["daffodil", "daffodil_dataset"]) - -@asset(key_prefix=["iris"]) -def iris_dataset() -> pd.DataFrame: - return pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) -``` + In this example, the `iris_dataset` asset will be stored in the `IRIS` schema, and the `daffodil_dataset` asset will be found in the `DAFFODIL` schema. -::: +:::note The schema is determined in this order:
    @@ -331,47 +163,7 @@ In this example, the `iris_dataset` asset will be stored in the `IRIS` schema, a You may have assets that you don't want to store in DuckDB. You can provide an I/O manager to each asset using the `io_manager_key` parameter in the decorator: -{/* TODO convert to */} -```python file=/integrations/duckdb/reference/multiple_io_managers.py startafter=start_example endbefore=end_example -import pandas as pd -from dagster_aws.s3.io_manager import s3_pickle_io_manager -from dagster_duckdb_pandas import DuckDBPandasIOManager - -from dagster import Definitions, asset - - -@asset(io_manager_key="warehouse_io_manager") -def iris_dataset() -> pd.DataFrame: - return pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) - - -@asset(io_manager_key="blob_io_manager") -def iris_plots(iris_dataset): - # plot_data is a function we've defined somewhere else - # that plots the data in a DataFrame - return plot_data(iris_dataset) - - -defs = Definitions( - assets=[iris_dataset, iris_plots], - resources={ - "warehouse_io_manager": DuckDBPandasIOManager( - database="path/to/my_duckdb_database.duckdb", - schema="IRIS", - ), - "blob_io_manager": s3_pickle_io_manager, - }, -) -``` + In this example: @@ -394,114 +186,20 @@ pip install dagster-duckdb-pyspark Then you can use the `DuckDBPySparkIOManager` in your as in [Step 1: Configure the DuckDB I/O manager](using-duckdb-with-dagster#step-1-configure-the-duckdb-io-manager) of the [Using Dagster with DuckDB tutorial](using-duckdb-with-dagster). -{/* TODO convert to */} -```python file=/integrations/duckdb/reference/pyspark_configuration.py startafter=start_configuration endbefore=end_configuration -from dagster_duckdb_pyspark import DuckDBPySparkIOManager - -from dagster import Definitions - -defs = Definitions( - assets=[iris_dataset], - resources={ - "io_manager": DuckDBPySparkIOManager( - database="path/to/my_duckdb_database.duckdb", # required - schema="IRIS", # optional, defaults to PUBLIC - ) - }, -) -``` + The `DuckDBPySparkIOManager` requires an active `SparkSession`. You can either create your own `SparkSession` or use the . 
-{/* TODO convert to */} -```python file=/integrations/duckdb/reference/pyspark_with_spark_resource.py -from dagster_duckdb_pyspark import DuckDBPySparkIOManager -from dagster_pyspark import pyspark_resource -from pyspark import SparkFiles -from pyspark.sql import DataFrame -from pyspark.sql.types import DoubleType, StringType, StructField, StructType - -from dagster import AssetExecutionContext, Definitions, asset - - -@asset(required_resource_keys={"pyspark"}) -def iris_dataset(context: AssetExecutionContext) -> DataFrame: - spark = context.resources.pyspark.spark_session - - schema = StructType( - [ - StructField("sepal_length_cm", DoubleType()), - StructField("sepal_width_cm", DoubleType()), - StructField("petal_length_cm", DoubleType()), - StructField("petal_width_cm", DoubleType()), - StructField("species", StringType()), - ] - ) - - url = "https://docs.dagster.io/assets/iris.csv" - spark.sparkContext.addFile(url) - - return spark.read.schema(schema).csv("file://" + SparkFiles.get("iris.csv")) - - -defs = Definitions( - assets=[iris_dataset], - resources={ - "io_manager": DuckDBPySparkIOManager( - database="path/to/my_duckdb_database.duckdb", - schema="IRIS", - ), - "pyspark": pyspark_resource, - }, -) -``` + -{/* TODO convert to */} -```python file=/integrations/duckdb/reference/pyspark_with_spark_session.py startafter=start endbefore=end -from dagster_duckdb_pyspark import DuckDBPySparkIOManager -from pyspark import SparkFiles -from pyspark.sql import DataFrame, SparkSession -from pyspark.sql.types import DoubleType, StringType, StructField, StructType - -from dagster import Definitions, asset - - -@asset -def iris_dataset() -> DataFrame: - spark = SparkSession.builder.getOrCreate() - - schema = StructType( - [ - StructField("sepal_length_cm", DoubleType()), - StructField("sepal_width_cm", DoubleType()), - StructField("petal_length_cm", DoubleType()), - StructField("petal_width_cm", DoubleType()), - StructField("species", StringType()), - ] - ) - - url = "https://docs.dagster.io/assets/iris.csv" - spark.sparkContext.addFile(url) - - return spark.read.schema(schema).csv("file://" + SparkFiles.get("iris.csv")) - - -defs = Definitions( - assets=[iris_dataset], - resources={ - "io_manager": DuckDBPySparkIOManager( - database="path/to/my_duckdb_database.duckdb", - schema="IRIS", - ) - }, -) -``` + + @@ -517,22 +215,8 @@ pip install dagster-duckdb-polars Then you can use the `DuckDBPolarsIOManager` in your as in [Step 1: Configure the DuckDB I/O manager](using-duckdb-with-dagster#step-1-configure-the-duckdb-io-manager) of the [Using Dagster with DuckDB tutorial](using-duckdb-with-dagster). -{/* TODO convert to */} -```python file=/integrations/duckdb/reference/polars_configuration.py startafter=start_configuration endbefore=end_configuration -from dagster_duckdb_polars import DuckDBPolarsIOManager - -from dagster import Definitions - -defs = Definitions( - assets=[iris_dataset], - resources={ - "io_manager": DuckDBPolarsIOManager( - database="path/to/my_duckdb_database.duckdb", # required - schema="IRIS", # optional, defaults to PUBLIC - ) - }, -) -``` + + @@ -543,48 +227,5 @@ If you work with several DataFrame libraries and want a single I/O manager to ha To do this, inherit from the base class and implement the `type_handlers` and `default_load_type` methods. The resulting I/O manager will inherit the configuration fields of the base `DuckDBIOManager`. 
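From the asset side, a combined I/O manager like this is transparent: each asset simply annotates (or omits) its return type, and the matching type handler is used for it. A rough usage-level sketch to set the scene for the implementation that follows; the asset names and values are illustrative:

```python
import pandas as pd
import polars as pl

from dagster import asset


@asset
def pandas_iris_summary() -> pd.DataFrame:
    # Stored and loaded as a Pandas DataFrame via the Pandas type handler.
    return pd.DataFrame({"species": ["Iris-setosa"], "count": [50]})


@asset
def polars_iris_summary() -> pl.DataFrame:
    # Stored and loaded as a Polars DataFrame via the Polars type handler.
    return pl.DataFrame({"species": ["Iris-setosa"], "count": [50]})


@asset
def unannotated_iris_summary():
    # No return annotation: default_load_type determines how this output
    # is stored and loaded (Pandas in the implementation below).
    return pd.DataFrame({"species": ["Iris-setosa"], "count": [50]})
```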
-{/* TODO convert to */} -```python file=/integrations/duckdb/reference/multiple_dataframe_types.py startafter=start_example endbefore=end_example -from typing import Optional, Type - -import pandas as pd -from dagster_duckdb import DuckDBIOManager -from dagster_duckdb_pandas import DuckDBPandasTypeHandler -from dagster_duckdb_polars import DuckDBPolarsTypeHandler -from dagster_duckdb_pyspark import DuckDBPySparkTypeHandler - -from dagster import Definitions - - -class DuckDBPandasPySparkPolarsIOManager(DuckDBIOManager): - @staticmethod - def type_handlers(): - """type_handlers should return a list of the TypeHandlers that the I/O manager can use. - Here we return the DuckDBPandasTypeHandler, DuckDBPySparkTypeHandler, and DuckDBPolarsTypeHandler so that the I/O - manager can store Pandas DataFrames, PySpark DataFrames, and Polars DataFrames. - """ - return [ - DuckDBPandasTypeHandler(), - DuckDBPySparkTypeHandler(), - DuckDBPolarsTypeHandler(), - ] - - @staticmethod - def default_load_type() -> Optional[type]: - """If an asset is not annotated with an return type, default_load_type will be used to - determine which TypeHandler to use to store and load the output. - In this case, unannotated assets will be stored and loaded as Pandas DataFrames. - """ - return pd.DataFrame - - -defs = Definitions( - assets=[iris_dataset, rose_dataset], - resources={ - "io_manager": DuckDBPandasPySparkPolarsIOManager( - database="path/to/my_duckdb_database.duckdb", - schema="IRIS", - ) - }, -) -``` + + diff --git a/docs/docs-beta/docs/integrations/libraries/duckdb/using-duckdb-with-dagster.md b/docs/docs-beta/docs/integrations/libraries/duckdb/using-duckdb-with-dagster.md index 6fd719ff3ac5e..f241883908062 100644 --- a/docs/docs-beta/docs/integrations/libraries/duckdb/using-duckdb-with-dagster.md +++ b/docs/docs-beta/docs/integrations/libraries/duckdb/using-duckdb-with-dagster.md @@ -49,21 +49,7 @@ To complete this tutorial, you'll need: To use the DuckDB resource, you'll need to add it to your `Definitions` object. The DuckDB resource requires some configuration. You must set a path to a DuckDB database as the `database` configuration value. If the database does not already exist, it will be created for you: -{/* TODO convert to */} -```python file=/integrations/duckdb/tutorial/resource/configuration.py startafter=start_example endbefore=end_example -from dagster_duckdb import DuckDBResource - -from dagster import Definitions - -defs = Definitions( - assets=[iris_dataset], - resources={ - "duckdb": DuckDBResource( - database="path/to/my_duckdb_database.duckdb", # required - ) - }, -) -``` + ### Step 2: Create tables in DuckDB \{#option-1-step-2} @@ -75,30 +61,7 @@ defs = Definitions( Using the DuckDB resource, you can create DuckDB tables using the DuckDB Python API: -{/* TODO convert to */} -```python file=/integrations/duckdb/tutorial/resource/create_table.py startafter=start_example endbefore=end_example -import pandas as pd -from dagster_duckdb import DuckDBResource - -from dagster import asset - - -@asset -def iris_dataset(duckdb: DuckDBResource) -> None: - iris_df = pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) - - with duckdb.get_connection() as conn: - conn.execute("CREATE TABLE iris.iris_dataset AS SELECT * FROM iris_df") -``` + In this example, you're defining an asset that fetches the Iris dataset as a Pandas DataFrame and renames the columns. 
Then, using the DuckDB resource, the DataFrame is stored in DuckDB as the `iris.iris_dataset` table. @@ -110,12 +73,8 @@ In this example, you're defining an asset that fetches the Iris dataset as a Pan If you already have existing tables in DuckDB and other assets defined in Dagster depend on those tables, you may want Dagster to be aware of those upstream dependencies. Making Dagster aware of these tables will allow you to track the full data lineage in Dagster. You can accomplish this by defining [external assets](/guides/build/assets/external-assets) for these tables. -{/* TODO convert to */} -```python file=/integrations/duckdb/tutorial/io_manager/source_asset.py -from dagster import AssetSpec -iris_harvest_data = AssetSpec(key="iris_harvest_data") -``` + In this example, you're creating a for a pre-existing table called `iris_harvest_data`. @@ -129,21 +88,7 @@ Now you can run `dagster dev` and materialize the `iris_dataset` asset from the Once you have created an asset that represents a table in DuckDB, you will likely want to create additional assets that work with the data. -{/* TODO convert to */} -```python file=/integrations/duckdb/tutorial/resource/downstream.py startafter=start_example endbefore=end_example -from dagster import asset - -# this example uses the iris_dataset asset from Step 1 - - -@asset(deps=[iris_dataset]) -def iris_setosa(duckdb: DuckDBResource) -> None: - with duckdb.get_connection() as conn: - conn.execute( - "CREATE TABLE iris.iris_setosa AS SELECT * FROM iris.iris_dataset WHERE" - " species = 'Iris-setosa'" - ) -``` + In this asset, you're creating second table that only contains the data for the _Iris Setosa_ species. This asset has a dependency on the `iris_dataset` asset. To define this dependency, you provide the `iris_dataset` asset as the `deps` parameter to the `iris_setosa` asset. You can then run the SQL query to create the table of _Iris Setosa_ data. @@ -151,51 +96,7 @@ In this asset, you're creating second table that only contains the data for the When finished, your code should look like the following: -{/* TODO convert to */} -```python file=/integrations/duckdb/tutorial/resource/full_example.py -import pandas as pd -from dagster_duckdb import DuckDBResource - -from dagster import AssetSpec, Definitions, asset - -iris_harvest_data = AssetSpec(key="iris_harvest_data") - - -@asset -def iris_dataset(duckdb: DuckDBResource) -> None: - iris_df = pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) - - with duckdb.get_connection() as conn: - conn.execute("CREATE TABLE iris.iris_dataset AS SELECT * FROM iris_df") - - -@asset(deps=[iris_dataset]) -def iris_setosa(duckdb: DuckDBResource) -> None: - with duckdb.get_connection() as conn: - conn.execute( - "CREATE TABLE iris.iris_setosa AS SELECT * FROM iris.iris_dataset WHERE" - " species = 'Iris-setosa'" - ) - - -defs = Definitions( - assets=[iris_dataset], - resources={ - "duckdb": DuckDBResource( - database="path/to/my_duckdb_database.duckdb", - ) - }, -) -``` + ## Option 2: Using the DuckDB I/O manager @@ -212,22 +113,7 @@ This section of the guide focuses on storing and loading Pandas DataFrames in Du To use the DuckDB I/O, you'll need to add it to your `Definitions` object. The DuckDB I/O manager requires some configuration to connect to your database. You must provide a path where a DuckDB database will be created. 
Additionally, you can specify a `schema` where the DuckDB I/O manager will create tables. -{/* TODO convert to */} -```python file=/integrations/duckdb/tutorial/io_manager/configuration.py startafter=start_example endbefore=end_example -from dagster_duckdb_pandas import DuckDBPandasIOManager - -from dagster import Definitions - -defs = Definitions( - assets=[iris_dataset], - resources={ - "io_manager": DuckDBPandasIOManager( - database="path/to/my_duckdb_database.duckdb", # required - schema="IRIS", # optional, defaults to PUBLIC - ) - }, -) -``` + ### Step 2: Create tables in DuckDB \{#option-2-step-2} @@ -241,26 +127,7 @@ The DuckDB I/O manager can create and update tables for your Dagster-defined ass To store data in DuckDB using the DuckDB I/O manager, you can simply return a Pandas DataFrame from your asset. Dagster will handle storing and loading your assets in DuckDB. -{/* TODO convert to */} -```python file=/integrations/duckdb/tutorial/io_manager/basic_example.py -import pandas as pd - -from dagster import asset - - -@asset -def iris_dataset() -> pd.DataFrame: - return pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) -``` + In this example, you're defining an asset that fetches the Iris dataset as a Pandas DataFrame, renames the columns, then returns the DataFrame. The type signature of the function tells the I/O manager what data type it is working with, so it is important to include the return type `pd.DataFrame`. @@ -274,12 +141,7 @@ When Dagster materializes the `iris_dataset` asset using the configuration from If you already have existing tables in DuckDB and other assets defined in Dagster depend on those tables, you may want Dagster to be aware of those upstream dependencies. Making Dagster aware of these tables will allow you to track the full data lineage in Dagster. You can accomplish this by defining [external assets](/guides/build/assets/external-assets) for these tables. -{/* TODO convert to */} -```python file=/integrations/duckdb/tutorial/io_manager/source_asset.py -from dagster import AssetSpec - -iris_harvest_data = AssetSpec(key="iris_harvest_data") -``` + In this example, you're creating a for a pre-existing table containing iris harvests data. To make the data available to other Dagster assets, you need to tell the DuckDB I/O manager how to find the data. @@ -292,19 +154,7 @@ Because you already supplied the database and schema in the I/O manager configur Once you have created an asset that represents a table in DuckDB, you will likely want to create additional assets that work with the data. Dagster and the DuckDB I/O manager allow you to load the data stored in DuckDB tables into downstream assets. -{/* TODO convert to */} -```python file=/integrations/duckdb/tutorial/io_manager/load_downstream.py startafter=start_example endbefore=end_example -import pandas as pd - -from dagster import asset - -# this example uses the iris_dataset asset from Step 2 - - -@asset -def iris_setosa(iris_dataset: pd.DataFrame) -> pd.DataFrame: - return iris_dataset[iris_dataset["species"] == "Iris-setosa"] -``` + In this asset, you're providing the `iris_dataset` asset as a dependency to `iris_setosa`. By supplying `iris_dataset` as a parameter to `iris_setosa`, Dagster knows to use the `DuckDBPandasIOManager` to load this asset into memory as a Pandas DataFrame and pass it as an argument to `iris_setosa`. 
Next, a DataFrame that only contains the data for the _Iris Setosa_ species is created and returned. Then the `DuckDBPandasIOManager` will store the DataFrame as the `IRIS.IRIS_SETOSA` table in DuckDB. @@ -312,45 +162,7 @@ In this asset, you're providing the `iris_dataset` asset as a dependency to `iri When finished, your code should look like the following: -{/* TODO convert to */} -```python file=/integrations/duckdb/tutorial/io_manager/full_example.py -import pandas as pd -from dagster_duckdb_pandas import DuckDBPandasIOManager - -from dagster import AssetSpec, Definitions, asset - -iris_harvest_data = AssetSpec(key="iris_harvest_data") - - -@asset -def iris_dataset() -> pd.DataFrame: - return pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) - - -@asset -def iris_setosa(iris_dataset: pd.DataFrame) -> pd.DataFrame: - return iris_dataset[iris_dataset["species"] == "Iris-setosa"] - - -defs = Definitions( - assets=[iris_dataset, iris_harvest_data, iris_setosa], - resources={ - "io_manager": DuckDBPandasIOManager( - database="path/to/my_duckdb_database.duckdb", - schema="IRIS", - ) - }, -) -``` + ## Related diff --git a/docs/docs-beta/docs/integrations/libraries/gcp/bigquery/reference.md b/docs/docs-beta/docs/integrations/libraries/gcp/bigquery/reference.md index 821f289136a45..ba203cd3717f7 100644 --- a/docs/docs-beta/docs/integrations/libraries/gcp/bigquery/reference.md +++ b/docs/docs-beta/docs/integrations/libraries/gcp/bigquery/reference.md @@ -29,53 +29,13 @@ cat ~/.gcp/key.json | base64 Then you can [set an environment variable](/guides/deploy/using-environment-variables-and-secrets) in your Dagster deployment (for example `GCP_CREDS`) to the encoded key and provide it to the BigQuery I/O manager: -{/* TODO convert to */} -```python file=/integrations/bigquery/reference/config_auth.py startafter=start_example endbefore=end_example -from dagster_gcp_pandas import BigQueryPandasIOManager - -from dagster import Definitions, EnvVar - -defs = Definitions( - assets=[iris_data], - resources={ - "io_manager": BigQueryPandasIOManager( - project="my-gcp-project", - location="us-east5", - dataset="IRIS", - timeout=15.0, - gcp_credentials=EnvVar("GCP_CREDS"), - ) - }, -) -``` + ## Selecting specific columns in a downstream asset Sometimes you may not want to fetch an entire table as the input to a downstream asset. With the BigQuery I/O manager, you can select specific columns to load by supplying metadata on the downstream asset. -{/* TODO convert to */} -```python file=/integrations/bigquery/reference/downstream_columns.py -import pandas as pd - -from dagster import AssetIn, asset - -# this example uses the iris_data asset from Step 2 of the Using Dagster with BigQuery tutorial - - -@asset( - ins={ - "iris_sepal": AssetIn( - key="iris_data", - metadata={"columns": ["sepal_length_cm", "sepal_width_cm"]}, - ) - } -) -def sepal_data(iris_sepal: pd.DataFrame) -> pd.DataFrame: - iris_sepal["sepal_area_cm2"] = ( - iris_sepal["sepal_length_cm"] * iris_sepal["sepal_width_cm"] - ) - return iris_sepal -``` + In this example, we only use the columns containing sepal data from the `IRIS_DATA` table created in [Step 2: Create tables in BigQuery](using-bigquery-with-dagster#step-2-create-tables-in-bigquery) of the [Using Dagster with BigQuery tutorial](using-bigquery-with-dagster). 
Fetching the entire table would be unnecessarily costly, so to select specific columns, we can add metadata to the input asset. We do this in the `metadata` parameter of the `AssetIn` that loads the `iris_data` asset in the `ins` parameter. We supply the key `columns` with a list of names of the columns we want to fetch. @@ -92,40 +52,7 @@ The BigQuery I/O manager supports storing and loading partitioned data. In order In order to store static partitioned assets in BigQuery, you must specify `partition_expr` metadata on the asset to tell the BigQuery I/O manager which column contains the partition data: -{/* TODO convert to */} -```python file=/integrations/bigquery/reference/static_partition.py startafter=start_example endbefore=end_example -import pandas as pd - -from dagster import AssetExecutionContext, StaticPartitionsDefinition, asset - - -@asset( - partitions_def=StaticPartitionsDefinition( - ["Iris-setosa", "Iris-virginica", "Iris-versicolor"] - ), - metadata={"partition_expr": "SPECIES"}, -) -def iris_data_partitioned(context: AssetExecutionContext) -> pd.DataFrame: - species = context.partition_key - - full_df = pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) - - return full_df[full_df["species"] == species] - - -@asset -def iris_cleaned(iris_data_partitioned: pd.DataFrame): - return iris_data_partitioned.dropna().drop_duplicates() -``` + Dagster uses the `partition_expr` metadata to craft the `SELECT` statement when loading the partition in the downstream asset. When loading a static partition, the following statement is used: @@ -150,30 +77,7 @@ SELECT * Like static partitioned assets, you can specify `partition_expr` metadata on the asset to tell the BigQuery I/O manager which column contains the partition data: -{/* TODO convert to */} -```python file=/integrations/bigquery/reference/time_partition.py startafter=start_example endbefore=end_example -import pandas as pd - -from dagster import AssetExecutionContext, DailyPartitionsDefinition, asset - - -@asset( - partitions_def=DailyPartitionsDefinition(start_date="2023-01-01"), - metadata={"partition_expr": "TIMESTAMP_SECONDS(TIME)"}, -) -def iris_data_per_day(context: AssetExecutionContext) -> pd.DataFrame: - partition = context.partition_key - - # get_iris_data_for_date fetches all of the iris data for a given date, - # the returned dataframe contains a column named 'TIME' with that stores - # the time of the row as an integer of seconds since epoch - return get_iris_data_for_date(partition) - - -@asset -def iris_cleaned(iris_data_per_day: pd.DataFrame): - return iris_data_per_day.dropna().drop_duplicates() -``` + Dagster uses the `partition_expr` metadata to craft the `SELECT` statement when loading the correct partition in the downstream asset. When loading a dynamic partition, the following statement is used: @@ -202,49 +106,7 @@ In this example, the data in the `TIME` column are integers, so the `partition_e The BigQuery I/O manager can also store data partitioned on multiple dimensions. 
To do this, you must specify the column for each partition as a dictionary of `partition_expr` metadata: -{/* TODO convert to */} -```python file=/integrations/bigquery/reference/multi_partition.py startafter=start_example endbefore=end_example -import pandas as pd - -from dagster import ( - AssetExecutionContext, - DailyPartitionsDefinition, - MultiPartitionsDefinition, - StaticPartitionsDefinition, - asset, -) - - -@asset( - partitions_def=MultiPartitionsDefinition( - { - "date": DailyPartitionsDefinition(start_date="2023-01-01"), - "species": StaticPartitionsDefinition( - ["Iris-setosa", "Iris-virginica", "Iris-versicolor"] - ), - } - ), - metadata={ - "partition_expr": {"date": "TIMESTAMP_SECONDS(TIME)", "species": "SPECIES"} - }, -) -def iris_data_partitioned(context: AssetExecutionContext) -> pd.DataFrame: - partition = context.partition_key.keys_by_dimension - species = partition["species"] - date = partition["date"] - - # get_iris_data_for_date fetches all of the iris data for a given date, - # the returned dataframe contains a column named 'TIME' with that stores - # the time of the row as an integer of seconds since epoch - full_df = get_iris_data_for_date(date) - - return full_df[full_df["species"] == species] - - -@asset -def iris_cleaned(iris_data_partitioned: pd.DataFrame): - return iris_data_partitioned.dropna().drop_duplicates() -``` + Dagster uses the `partition_expr` metadata to craft the `SELECT` statement when loading the correct partition in a downstream asset. For multi-partitions, Dagster concatenates the `WHERE` statements described in the static partition and time-window partition sections to craft the correct `SELECT` statement. @@ -268,47 +130,16 @@ You can specify the default dataset where data will be stored as configuration t If you want to store assets in different datasets, you can specify the dataset as metadata: -{/* TODO convert to */} -```python file=/integrations/bigquery/reference/dataset.py startafter=start_metadata endbefore=end_metadata dedent=4 -daffodil_data = AssetSpec(key=["daffodil_data"], metadata={"schema": "daffodil"}) - -@asset(metadata={"schema": "iris"}) -def iris_data() -> pd.DataFrame: - return pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) -``` + You can also specify the dataset as part of the asset's asset key: -{/* TODO convert to */} -```python file=/integrations/bigquery/reference/dataset.py startafter=start_asset_key endbefore=end_asset_key dedent=4 -daffodil_data = AssetSpec(key=["gcp", "bigquery", "daffodil", "daffodil_data"]) - -@asset(key_prefix=["gcp", "bigquery", "iris"]) -def iris_data() -> pd.DataFrame: - return pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) -``` +{/* TODO add dedent=4 to CodeExample below */} + The dataset will be the last prefix before the asset's name. In this example, the `iris_data` asset will be stored in the `IRIS` dataset, and the `daffodil_data` asset will be found in the `DAFFODIL` dataset. -::: +:::note The dataset is determined in this order:
      @@ -332,47 +163,7 @@ The dataset will be the last prefix before the asset's name. In this example, th You may have assets that you don't want to store in BigQuery. You can provide an I/O manager to each asset using the `io_manager_key` parameter in the `asset` decorator: -{/* TODO convert to */} -```python file=/integrations/bigquery/reference/multiple_io_managers.py startafter=start_example endbefore=end_example -import pandas as pd -from dagster_aws.s3.io_manager import s3_pickle_io_manager -from dagster_gcp_pandas import BigQueryPandasIOManager - -from dagster import Definitions, asset - - -@asset(io_manager_key="warehouse_io_manager") -def iris_data() -> pd.DataFrame: - return pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) - - -@asset(io_manager_key="blob_io_manager") -def iris_plots(iris_data): - # plot_data is a function we've defined somewhere else - # that plots the data in a DataFrame - return plot_data(iris_data) - - -defs = Definitions( - assets=[iris_data, iris_plots], - resources={ - "warehouse_io_manager": BigQueryPandasIOManager( - project="my-gcp-project", - dataset="IRIS", - ), - "blob_io_manager": s3_pickle_io_manager, - }, -) -``` + In this example, the `iris_data` asset uses the I/O manager bound to the key `warehouse_io_manager` and `iris_plots` will use the I/O manager bound to the key `blob_io_manager`. In the object, we supply the I/O managers for those keys. When the assets are materialized, the `iris_data` will be stored in BigQuery, and `iris_plots` will be saved in Amazon S3. @@ -386,26 +177,9 @@ pip install dagster-gcp-pyspark Then you can use the `gcp_pyspark_io_manager` in your `Definitions` as in [Step 1: Configure the BigQuery I/O manager](using-bigquery-with-dagster#step-1-configure-the-bigquery-io-manager) of the [Using Dagster with BigQuery tutorial](using-bigquery-with-dagster). -{/* TODO convert to */} -```python file=/integrations/bigquery/reference/pyspark_configuration.py startafter=start_configuration endbefore=end_configuration -from dagster_gcp_pyspark import BigQueryPySparkIOManager - -from dagster import Definitions - -defs = Definitions( - assets=[iris_data], - resources={ - "io_manager": BigQueryPySparkIOManager( - project="my-gcp-project", # required - location="us-east5", # optional, defaults to the default location for the project - see https://cloud.google.com/bigquery/docs/locations for a list of locations - dataset="IRIS", # optional, defaults to PUBLIC - temporary_gcs_bucket="my-gcs-bucket", # optional, defaults to None, which will result in a direct write to BigQuery - ) - }, -) -``` + -::: +:::note When using the `BigQueryPySparkIOManager` you may provide the `temporary_gcs_bucket` configuration. This will store the data is a temporary GCS bucket, then all of the data into BigQuery in one operation. If not provided, data will be directly written to BigQuery. If you choose to use a temporary GCS bucket, you must include the [GCS Hadoop connector](https://github.com/GoogleCloudDataproc/hadoop-connectors/tree/master/gcs) in your Spark Session, in addition to the BigQuery connector (described below). 
@@ -416,108 +190,21 @@ The `BigQueryPySparkIOManager` requires that a `SparkSession` be active and conf -{/* TODO convert to */} -```python file=/integrations/bigquery/reference/pyspark_with_spark_resource.py -from dagster_gcp_pyspark import BigQueryPySparkIOManager -from dagster_pyspark import pyspark_resource -from pyspark import SparkFiles -from pyspark.sql import DataFrame, SparkSession -from pyspark.sql.types import DoubleType, StringType, StructField, StructType - -from dagster import AssetExecutionContext, Definitions, asset - -BIGQUERY_JARS = "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.28.0" - - -@asset(required_resource_keys={"pyspark"}) -def iris_data(context: AssetExecutionContext) -> DataFrame: - spark = context.resources.pyspark.spark_session - - schema = StructType( - [ - StructField("sepal_length_cm", DoubleType()), - StructField("sepal_width_cm", DoubleType()), - StructField("petal_length_cm", DoubleType()), - StructField("petal_width_cm", DoubleType()), - StructField("species", StringType()), - ] - ) - - url = "https://docs.dagster.io/assets/iris.csv" - spark.sparkContext.addFile(url) - - return spark.read.schema(schema).csv("file://" + SparkFiles.get("iris.csv")) - - -defs = Definitions( - assets=[iris_data], - resources={ - "io_manager": BigQueryPySparkIOManager( - project="my-gcp-project", - location="us-east5", - ), - "pyspark": pyspark_resource.configured( - {"spark_conf": {"spark.jars.packages": BIGQUERY_JARS}} - ), - }, -) -``` + + -{/* TODO convert to */} -```python file=/integrations/bigquery/reference/pyspark_with_spark_session.py -from dagster_gcp_pyspark import BigQueryPySparkIOManager -from pyspark import SparkFiles -from pyspark.sql import DataFrame, SparkSession -from pyspark.sql.types import DoubleType, StringType, StructField, StructType - -from dagster import Definitions, asset - -BIGQUERY_JARS = "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.28.0" - - -@asset -def iris_data() -> DataFrame: - spark = SparkSession.builder.config( - key="spark.jars.packages", - value=BIGQUERY_JARS, - ).getOrCreate() - - schema = StructType( - [ - StructField("sepal_length_cm", DoubleType()), - StructField("sepal_width_cm", DoubleType()), - StructField("petal_length_cm", DoubleType()), - StructField("petal_width_cm", DoubleType()), - StructField("species", StringType()), - ] - ) - - url = "https://docs.dagster.io/assets/iris.csv" - spark.sparkContext.addFile(url) - - return spark.read.schema(schema).csv("file://" + SparkFiles.get("iris.csv")) - - -defs = Definitions( - assets=[iris_data], - resources={ - "io_manager": BigQueryPySparkIOManager( - project="my-gcp-project", - location="us-east5", - ), - }, -) -``` + + -::: +:::note -**Note:** In order to load data from BigQuery as a PySpark DataFrame, the BigQuery PySpark connector will create a view containing the data. This will result in the creation of a temporary table in your BigQuery dataset. For more details, see the [BigQuery PySpark connector documentation](https://github.com/GoogleCloudDataproc/spark-bigquery-connector#reading-data-from-a-bigquery-query). +In order to load data from BigQuery as a PySpark DataFrame, the BigQuery PySpark connector will create a view containing the data. This will result in the creation of a temporary table in your BigQuery dataset. For more details, see the [BigQuery PySpark connector documentation](https://github.com/GoogleCloudDataproc/spark-bigquery-connector#reading-data-from-a-bigquery-query). 
::: @@ -525,79 +212,13 @@ defs = Definitions( If you work with both Pandas and PySpark DataFrames and want a single I/O manager to handle storing and loading these DataFrames in BigQuery, you can write a new I/O manager that handles both types. To do this, inherit from the base class and implement the `type_handlers` and `default_load_type` methods. The resulting I/O manager will inherit the configuration fields of the base `BigQueryIOManager`. -{/* TODO convert to */} -```python file=/integrations/bigquery/reference/pandas_and_pyspark.py startafter=start_example endbefore=end_example -from collections.abc import Sequence -from typing import Optional, Type - -import pandas as pd -from dagster_gcp import BigQueryIOManager -from dagster_gcp_pandas import BigQueryPandasTypeHandler -from dagster_gcp_pyspark import BigQueryPySparkTypeHandler - -from dagster import DbTypeHandler, Definitions - - -class MyBigQueryIOManager(BigQueryIOManager): - @staticmethod - def type_handlers() -> Sequence[DbTypeHandler]: - """type_handlers should return a list of the TypeHandlers that the I/O manager can use. - Here we return the BigQueryPandasTypeHandler and BigQueryPySparkTypeHandler so that the I/O - manager can store Pandas DataFrames and PySpark DataFrames. - """ - return [BigQueryPandasTypeHandler(), BigQueryPySparkTypeHandler()] - - @staticmethod - def default_load_type() -> Optional[type]: - """If an asset is not annotated with an return type, default_load_type will be used to - determine which TypeHandler to use to store and load the output. - - In this case, unannotated assets will be stored and loaded as Pandas DataFrames. - """ - return pd.DataFrame - - -defs = Definitions( - assets=[iris_data, rose_data], - resources={ - "io_manager": MyBigQueryIOManager(project="my-gcp-project", dataset="FLOWERS") - }, -) -``` + ## Executing custom SQL commands with the BigQuery resource In addition to the BigQuery I/O manager, Dagster also provides a BigQuery [resource](/guides/build/external-resources/) for executing custom SQL queries. -{/* TODO convert to */} -```python file=/integrations/bigquery/reference/resource.py -from dagster_gcp import BigQueryResource - -from dagster import Definitions, asset - -# this example executes a query against the IRIS.IRIS_DATA table created in Step 2 of the -# Using Dagster with BigQuery tutorial - - -@asset -def small_petals(bigquery: BigQueryResource): - with bigquery.get_client() as client: - return client.query( - 'SELECT * FROM IRIS.IRIS_DATA WHERE "petal_length_cm" < 1 AND' - ' "petal_width_cm" < 1', - ).result() - - -defs = Definitions( - assets=[small_petals], - resources={ - "bigquery": BigQueryResource( - project="my-gcp-project", - location="us-east5", - ) - }, -) -``` + In this example, we attach the BigQuery resource to the `small_petals` asset. In the body of the asset function, we use the `get_client` context manager method of the resource to get a [`bigquery.client.Client`](https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client). We can use the client to execute a custom SQL query against the `IRIS_DATA` table created in [Step 2: Create tables in BigQuery](using-bigquery-with-dagster#step-2-create-tables-in-bigquery) of the [Using Dagster with BigQuery tutorial](using-bigquery-with-dagster). 
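As a minimal sketch of this pattern, any query supported by the BigQuery client can be run inside the `get_client` block. The asset name and query below are illustrative and are not part of the tutorial:

```python
from dagster_gcp import BigQueryResource

import dagster as dg


@dg.asset
def iris_row_count(bigquery: BigQueryResource) -> None:
    # get_client yields an authenticated google.cloud.bigquery Client
    with bigquery.get_client() as client:
        job = client.query("SELECT COUNT(*) AS n FROM IRIS.IRIS_DATA")
        for row in job.result():
            dg.get_dagster_logger().info(f"IRIS.IRIS_DATA contains {row.n} rows")
```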
diff --git a/docs/docs-beta/docs/integrations/libraries/gcp/bigquery/using-bigquery-with-dagster.md b/docs/docs-beta/docs/integrations/libraries/gcp/bigquery/using-bigquery-with-dagster.md index fe1f03e348483..ac4ae0ca5d780 100644 --- a/docs/docs-beta/docs/integrations/libraries/gcp/bigquery/using-bigquery-with-dagster.md +++ b/docs/docs-beta/docs/integrations/libraries/gcp/bigquery/using-bigquery-with-dagster.md @@ -60,22 +60,7 @@ To use the BigQuery resource, you'll need to add it to your `Definitions` object You can also specify a `location` where computation should take place. -{/* TODO convert to */} -```python file=/integrations/bigquery/tutorial/resource/configuration.py startafter=start_example endbefore=end_example -from dagster_gcp import BigQueryResource - -from dagster import Definitions - -defs = Definitions( - assets=[iris_data], - resources={ - "bigquery": BigQueryResource( - project="my-gcp-project", # required - location="us-east5", # optional, defaults to the default location for the project - see https://cloud.google.com/bigquery/docs/locations for a list of locations - ) - }, -) -``` + ### Step 2: Create tables in BigQuery @@ -87,34 +72,7 @@ defs = Definitions( Using the BigQuery resource, you can create BigQuery tables using the BigQuery Python API: -{/* TODO convert to */} -```python file=/integrations/bigquery/tutorial/resource/create_table.py startafter=start_example endbefore=end_example -import pandas as pd -from dagster_gcp import BigQueryResource - -from dagster import asset - - -@asset -def iris_data(bigquery: BigQueryResource) -> None: - iris_df = pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) - - with bigquery.get_client() as client: - job = client.load_table_from_dataframe( - dataframe=iris_df, - destination="iris.iris_data", - ) - job.result() -``` + In this example, you're defining an asset that fetches the Iris dataset as a Pandas DataFrame and renames the columns. Then, using the BigQuery resource, the DataFrame is stored in BigQuery as the `iris.iris_data` table. @@ -128,12 +86,7 @@ Now you can run `dagster dev` and materialize the `iris_data` asset from the Dag If you already have existing tables in BigQuery and other assets defined in Dagster depend on those tables, you may want Dagster to be aware of those upstream dependencies. Making Dagster aware of these tables will allow you to track the full data lineage in Dagster. You can accomplish this by defining [external assets](/guides/build/assets/external-assets) for these tables. -{/* TODO convert to */} -```python file=/integrations/bigquery/tutorial/resource/source_asset.py -from dagster import AssetSpec - -iris_harvest_data = AssetSpec(key="iris_harvest_data") -``` + In this example, you're creating an for a pre-existing table called `iris_harvest_data`. 
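If other assets should depend on that table, the external asset can be referenced like any other upstream asset. A short sketch, assuming a hypothetical downstream asset named `iris_harvest_report`:

```python
import dagster as dg

iris_harvest_data = dg.AssetSpec(key="iris_harvest_data")


@dg.asset(deps=[iris_harvest_data])
def iris_harvest_report() -> None:
    # A hypothetical downstream asset; declaring the dependency lets Dagster
    # show lineage from the pre-existing BigQuery table to this asset.
    ...
```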
@@ -145,24 +98,7 @@ In this example, you're creating an */} -```python file=/integrations/bigquery/tutorial/resource/downstream.py startafter=start_example endbefore=end_example -from dagster import asset - -from .create_table import iris_data - -# this example uses the iris_dataset asset from Step 2 - - -@asset(deps=[iris_data]) -def iris_setosa(bigquery: BigQueryResource) -> None: - job_config = bq.QueryJobConfig(destination="iris.iris_setosa") - sql = "SELECT * FROM iris.iris_data WHERE species = 'Iris-setosa'" - - with bigquery.get_client() as client: - job = client.query(sql, job_config=job_config) - job.result() -``` + In this asset, you're creating second table that only contains the data for the _Iris Setosa_ species. This asset has a dependency on the `iris_data` asset. To define this dependency, you provide the `iris_data` asset as the `deps` parameter to the `iris_setosa` asset. You can then run the SQL query to create the table of _Iris Setosa_ data. @@ -170,58 +106,7 @@ In this asset, you're creating second table that only contains the data for the When finished, your code should look like the following: -{/* TODO convert to */} -```python file=/integrations/bigquery/tutorial/resource/full_example.py -import pandas as pd -from dagster_gcp import BigQueryResource -from google.cloud import bigquery as bq - -from dagster import AssetSpec, Definitions, asset - -iris_harvest_data = AssetSpec(key="iris_harvest_data") - - -@asset -def iris_data(bigquery: BigQueryResource) -> None: - iris_df = pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) - - with bigquery.get_client() as client: - job = client.load_table_from_dataframe( - dataframe=iris_df, - destination="iris.iris_data", - ) - job.result() - - -@asset(deps=[iris_data]) -def iris_setosa(bigquery: BigQueryResource) -> None: - job_config = bq.QueryJobConfig(destination="iris.iris_setosa") - sql = "SELECT * FROM iris.iris_data WHERE species = 'Iris-setosa'" - - with bigquery.get_client() as client: - job = client.query(sql, job_config=job_config) - job.result() - - -defs = Definitions( - assets=[iris_data, iris_setosa, iris_harvest_data], - resources={ - "bigquery": BigQueryResource( - project="my-gcp-project", - location="us-east5", - ) - }, -) -``` + ## Option 2: Using the BigQuery I/O manager @@ -243,24 +128,7 @@ To use the BigQuery I/O manager, you'll need to add it to your `Definitions` obj You can also specify a `location` where data should be stored and processed and `dataset` that should hold the created tables. You can also set a `timeout` when working with Pandas DataFrames. -{/* TODO convert to */} -```python file=/integrations/bigquery/tutorial/io_manager/configuration.py startafter=start_example endbefore=end_example -from dagster_gcp_pandas import BigQueryPandasIOManager - -from dagster import Definitions - -defs = Definitions( - assets=[iris_data], - resources={ - "io_manager": BigQueryPandasIOManager( - project="my-gcp-project", # required - location="us-east5", # optional, defaults to the default location for the project - see https://cloud.google.com/bigquery/docs/locations for a list of locations - dataset="IRIS", # optional, defaults to PUBLIC - timeout=15.0, # optional, defaults to None - ) - }, -) -``` + With this configuration, if you materialized an asset called `iris_data`, the BigQuery I/O manager would store the data in the `IRIS.IRIS_DATA` table in the `my-gcp-project` project. 
The BigQuery instance would be located in `us-east5`. @@ -280,26 +148,7 @@ The BigQuery I/O manager can create and update tables for your Dagster defined a To store data in BigQuery using the BigQuery I/O manager, you can simply return a Pandas DataFrame from your asset. Dagster will handle storing and loading your assets in BigQuery. -{/* TODO convert to */} -```python file=/integrations/bigquery/tutorial/io_manager/basic_example.py -import pandas as pd - -from dagster import asset - - -@asset -def iris_data() -> pd.DataFrame: - return pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) -``` + In this example, you're defining an [asset](/guides/build/assets/defining-assets) that fetches the Iris dataset as a Pandas DataFrame, renames the columns, then returns the DataFrame. The type signature of the function tells the I/O manager what data type it is working with, so it is important to include the return type `pd.DataFrame`. @@ -313,12 +162,7 @@ When Dagster materializes the `iris_data` asset using the configuration from [St If you already have existing tables in BigQuery and other assets defined in Dagster depend on those tables, you may want Dagster to be aware of those upstream dependencies. Making Dagster aware of these tables will allow you to track the full data lineage in Dagster. You can define [external assets](/guides/build/assets/external-assets) for these tables. When using an I/O manager, defining an external asset for an existing table also allows you to tell Dagster how to find the table so it can be fetched for downstream assets. -{/* TODO convert to */} -```python file=/integrations/bigquery/tutorial/io_manager/source_asset.py -from dagster import AssetSpec - -iris_harvest_data = AssetSpec(key="iris_harvest_data") -``` + In this example, you're creating a for a pre-existing table - perhaps created by an external data ingestion tool - that contains data about iris harvests. To make the data available to other Dagster assets, you need to tell the BigQuery I/O manager how to find the data, so that the I/O manager can load the data into memory. @@ -331,19 +175,7 @@ Because you already supplied the project and dataset in the I/O manager configur Once you have created an asset that represents a table in BigQuery, you will likely want to create additional assets that work with the data. Dagster and the BigQuery I/O manager allow you to load the data stored in BigQuery tables into downstream assets. -{/* TODO convert to */} -```python file=/integrations/bigquery/tutorial/io_manager/load_downstream.py startafter=start_example endbefore=end_example -import pandas as pd - -from dagster import asset - -# this example uses the iris_data asset from Step 2 - - -@asset -def iris_setosa(iris_data: pd.DataFrame) -> pd.DataFrame: - return iris_data[iris_data["species"] == "Iris-setosa"] -``` + In this asset, you're providing the `iris_data` asset from the [Store a Dagster asset as a table in BigQuery](#option-2-step-2) example to the `iris_setosa` asset. 
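Condensed, the pattern looks roughly like this, mirroring the example referenced above:

```python
import pandas as pd

import dagster as dg


@dg.asset
def iris_setosa(iris_data: pd.DataFrame) -> pd.DataFrame:
    # The I/O manager loads the upstream IRIS.IRIS_DATA table into a DataFrame
    # before this function runs and stores the returned DataFrame as a new table.
    return iris_data[iris_data["species"] == "Iris-setosa"]
```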
@@ -353,47 +185,7 @@ In this asset, you're providing the `iris_data` asset as a dependency to `iris_s When finished, your code should look like the following: -{/* TODO convert to */} -```python file=/integrations/bigquery/tutorial/io_manager/full_example.py -import pandas as pd -from dagster_gcp_pandas import BigQueryPandasIOManager - -from dagster import AssetSpec, Definitions, asset - -iris_harvest_data = AssetSpec(key="iris_harvest_data") - - -@asset -def iris_data() -> pd.DataFrame: - return pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) - - -@asset -def iris_setosa(iris_data: pd.DataFrame) -> pd.DataFrame: - return iris_data[iris_data["species"] == "Iris-setosa"] - - -defs = Definitions( - assets=[iris_data, iris_harvest_data, iris_setosa], - resources={ - "io_manager": BigQueryPandasIOManager( - project="my-gcp-project", - location="us-east5", - dataset="IRIS", - timeout=15.0, - ) - }, -) -``` + ## Related diff --git a/docs/docs-beta/docs/integrations/libraries/jupyter/reference.md b/docs/docs-beta/docs/integrations/libraries/jupyter/reference.md index c4ceccd3259ef..067b374311989 100644 --- a/docs/docs-beta/docs/integrations/libraries/jupyter/reference.md +++ b/docs/docs-beta/docs/integrations/libraries/jupyter/reference.md @@ -12,17 +12,7 @@ For a step-by-step implementation walkthrough, refer to the [Using notebooks wit To load a Jupyter notebook as a Dagster [asset](/guides/build/assets/defining-assets), use : -{/* TODO convert to */} -```python file=/integrations/dagstermill/iris_notebook_asset.py -from dagstermill import define_dagstermill_asset - -from dagster import file_relative_path - -iris_kmeans_notebook = define_dagstermill_asset( - name="iris_kmeans", - notebook_path=file_relative_path(__file__, "../notebooks/iris-kmeans.ipynb"), -) -``` + In this code block, we use `define_dagstermill_asset` to create a Dagster asset. We provide the name for the asset with the `name` parameter and the path to our `.ipynb` file with the `notebook_path` parameter. The resulting asset will execute our notebook and store the resulting `.ipynb` file in a persistent location. @@ -30,27 +20,7 @@ In this code block, we use `define_dagstermill_asset` to create a Dagster asset. Dagstermill also supports running Jupyter notebooks as [ops](/guides/build/ops). We can use to turn a notebook into an op: -{/* TODO convert to */} -```python file=/integrations/dagstermill/iris_notebook_op.py startafter=start -from dagstermill import ConfigurableLocalOutputNotebookIOManager, define_dagstermill_op - -from dagster import file_relative_path, job - -k_means_iris = define_dagstermill_op( - name="k_means_iris", - notebook_path=file_relative_path(__file__, "./notebooks/iris-kmeans.ipynb"), - output_notebook_name="iris_kmeans_output", -) - - -@job( - resource_defs={ - "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager(), - } -) -def iris_classify(): - k_means_iris() -``` + In this code block, we use `define_dagstermill_op` to create an op that will execute the Jupyter notebook. We give the op the name `k_means_iris`, and provide the path to the notebook file. We also specify `output_notebook_name=iris_kmeans_output`. This means that the executed notebook will be returned in a buffered file object as one of the outputs of the op, and that output will have the name `iris_kmeans_output`. 
We then include the `k_means_iris` op in the `iris_classify` [job](/guides/build/jobs) and specify the `ConfigurableLocalOutputNotebookIOManager` as the `output_notebook_io_manager` to store the executed notebook file. @@ -64,25 +34,7 @@ You can use the development context to access asset and op config and resources, For instance, suppose we want to make the number of clusters (the _k_ in k-means) configurable. We'll change our asset definition to include a config field: -{/* TODO convert to */} -```python file=/integrations/dagstermill/iris_notebook_config.py startafter=start endbefore=end -from dagstermill import define_dagstermill_asset - -from dagster import AssetIn, Field, Int, file_relative_path - -iris_kmeans_jupyter_notebook = define_dagstermill_asset( - name="iris_kmeans_jupyter", - notebook_path=file_relative_path(__file__, "./notebooks/iris-kmeans.ipynb"), - group_name="template_tutorial", - ins={"iris": AssetIn("iris_dataset")}, - config_schema=Field( - Int, - default_value=3, - is_required=False, - description="The number of clusters to find", - ), -) -``` + You can also provide `config_schema` to `define_dagstermill_op` in the same way demonstrated in this code snippet. @@ -102,7 +54,7 @@ estimator = sklearn.cluster.KMeans(n_clusters=context.op_config) ## Results and custom materializations -::: +:::note The functionality described in this section only works for notebooks run with `define_dagstermill_op`. If you'd like adding this feature to `define_dagstermill_asset` to be prioritized, give this [GitHub issue](https://github.com/dagster-io/dagster/issues/10557) a thumbs up. @@ -110,44 +62,12 @@ The functionality described in this section only works for notebooks run with `d If you are using `define_dagstermill_op` and you'd like to yield a result to be consumed downstream of a notebook, you can call with the value of the result and its name. In interactive execution, this is a no-op, so you don't need to change anything when moving from interactive exploration and development to production. 
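For example, a notebook cell might yield a value like this. This is a minimal sketch; the value and output name are illustrative:

```python
# In the notebook (.ipynb) wrapped by define_dagstermill_op
import dagstermill

# Make the value 3 available to downstream ops under the name "my_output"
dagstermill.yield_result(3, output_name="my_output")
```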
-{/* TODO convert to */} -```python file=/integrations/dagstermill/notebook_outputs.py startafter=start_notebook endbefore=end_notebook -# my_notebook.ipynb -import dagstermill -dagstermill.yield_result(3, output_name="my_output") -``` + And then: -{/* TODO convert to */} -```python file=/integrations/dagstermill/notebook_outputs.py startafter=start_py_file endbefore=end_py_file -from dagstermill import ConfigurableLocalOutputNotebookIOManager, define_dagstermill_op - -from dagster import Out, file_relative_path, job, op - -my_notebook_op = define_dagstermill_op( - name="my_notebook", - notebook_path=file_relative_path(__file__, "./notebooks/my_notebook.ipynb"), - output_notebook_name="output_notebook", - outs={"my_output": Out(int)}, -) - - -@op -def add_two(x): - return x + 2 - - -@job( - resource_defs={ - "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager(), - } -) -def my_job(): - three, _ = my_notebook_op() - add_two(three) -``` + ## Dagster events diff --git a/docs/docs-beta/docs/integrations/libraries/looker/using-looker-with-dagster.md b/docs/docs-beta/docs/integrations/libraries/looker/using-looker-with-dagster.md index 1431f5e34faac..713f7ffe79fc8 100644 --- a/docs/docs-beta/docs/integrations/libraries/looker/using-looker-with-dagster.md +++ b/docs/docs-beta/docs/integrations/libraries/looker/using-looker-with-dagster.md @@ -44,21 +44,7 @@ To load Looker assets into the Dagster asset graph, you must first construct a < Dagster can automatically load all views, explores, and dashboards from your Looker instance as asset specs. Call the function, which returns a list of representing your Looker assets. You can then include these asset specs in your object: -{/* TODO convert to */} -```python file=/integrations/looker/representing-looker-assets.py -from dagster_looker import LookerResource, load_looker_asset_specs - -import dagster as dg - -looker_resource = LookerResource( - client_id=dg.EnvVar("LOOKERSDK_CLIENT_ID"), - client_secret=dg.EnvVar("LOOKERSDK_CLIENT_SECRET"), - base_url=dg.EnvVar("LOOKERSDK_HOST_URL"), -) - -looker_specs = load_looker_asset_specs(looker_resource=looker_resource) -defs = dg.Definitions(assets=[*looker_specs], resources={"looker": looker_resource}) -``` + ## Load Looker assets from filtered dashboards and explores @@ -66,77 +52,14 @@ It is possible to load a subset of your Looker assets by providing a */} -```python file=/integrations/looker/filtering-looker-assets.py -from dagster_looker import LookerFilter, LookerResource, load_looker_asset_specs - -import dagster as dg - -looker_resource = LookerResource( - client_id=dg.EnvVar("LOOKERSDK_CLIENT_ID"), - client_secret=dg.EnvVar("LOOKERSDK_CLIENT_SECRET"), - base_url=dg.EnvVar("LOOKERSDK_HOST_URL"), -) - -looker_specs = load_looker_asset_specs( - looker_resource=looker_resource, - looker_filter=LookerFilter( - dashboard_folders=[ - ["my_folder", "my_subfolder"], - ["my_folder", "my_other_subfolder"], - ], - only_fetch_explores_used_in_dashboards=True, - ), -) -defs = dg.Definitions(assets=[*looker_specs], resources={"looker": looker_resource}) -``` + ### Customize asset definition metadata for Looker assets By default, Dagster will generate asset specs for each Looker asset based on its type, and populate default metadata. You can further customize asset properties by passing a custom subclass to the function. This subclass can implement methods to customize the asset specs for each Looker asset type. 
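For instance, a minimal translator might only tag every Looker asset with an owner. The class and team names below are illustrative:

```python
from dagster_looker import (
    DagsterLookerApiTranslator,
    LookerApiTranslatorStructureData,
)

import dagster as dg


class OwnerTaggingLookerTranslator(DagsterLookerApiTranslator):
    def get_asset_spec(
        self, looker_structure: LookerApiTranslatorStructureData
    ) -> dg.AssetSpec:
        # Start from the default spec, then customize it
        default_spec = super().get_asset_spec(looker_structure)
        return default_spec.replace_attributes(owners=["team:my_team"])
```

An instance of the translator is then passed to `load_looker_asset_specs` via its `dagster_looker_translator` argument.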
-{/* TODO convert to */} -```python file=/integrations/looker/customize-looker-assets.py -from dagster_looker import ( - DagsterLookerApiTranslator, - LookerApiTranslatorStructureData, - LookerResource, - LookerStructureType, - load_looker_asset_specs, -) - -import dagster as dg - -looker_resource = LookerResource( - client_id=dg.EnvVar("LOOKERSDK_CLIENT_ID"), - client_secret=dg.EnvVar("LOOKERSDK_CLIENT_SECRET"), - base_url=dg.EnvVar("LOOKERSDK_HOST_URL"), -) - - -class CustomDagsterLookerApiTranslator(DagsterLookerApiTranslator): - def get_asset_spec( - self, looker_structure: LookerApiTranslatorStructureData - ) -> dg.AssetSpec: - # We create the default asset spec using super() - default_spec = super().get_asset_spec(looker_structure) - # We customize the team owner tag for all assets, - # and we customize the asset key prefix only for dashboards. - return default_spec.replace_attributes( - key=( - default_spec.key.with_prefix("looker") - if looker_structure.structure_type == LookerStructureType.DASHBOARD - else default_spec.key - ), - owners=["team:my_team"], - ) - - -looker_specs = load_looker_asset_specs( - looker_resource, dagster_looker_translator=CustomDagsterLookerApiTranslator() -) -defs = dg.Definitions(assets=[*looker_specs], resources={"looker": looker_resource}) -``` + + Note that `super()` is called in each of the overridden methods to generate the default asset spec. It is best practice to generate the default asset spec before customizing it. @@ -144,38 +67,8 @@ Note that `super()` is called in each of the overridden methods to generate the You can use Dagster to orchestrate the materialization of Looker PDTs. To model PDTs as assets, build their asset definitions by passing a list of to function. -{/* TODO convert to */} -```python file=/integrations/looker/materializing-looker-pdts.py -from dagster_looker import ( - LookerResource, - RequestStartPdtBuild, - build_looker_pdt_assets_definitions, - load_looker_asset_specs, -) - -import dagster as dg - -looker_resource = LookerResource( - client_id=dg.EnvVar("LOOKERSDK_CLIENT_ID"), - client_secret=dg.EnvVar("LOOKERSDK_CLIENT_SECRET"), - base_url=dg.EnvVar("LOOKERSDK_HOST_URL"), -) - -looker_specs = load_looker_asset_specs(looker_resource=looker_resource) - -pdts = build_looker_pdt_assets_definitions( - resource_key="looker", - request_start_pdt_builds=[ - RequestStartPdtBuild(model_name="my_model", view_name="my_view") - ], -) - - -defs = dg.Definitions( - assets=[*pdts, *looker_specs], - resources={"looker": looker_resource}, -) -``` + + ### Related diff --git a/docs/docs-beta/docs/integrations/libraries/openai/using-openai-with-dagster.md b/docs/docs-beta/docs/integrations/libraries/openai/using-openai-with-dagster.md index 5b17be24d650f..fd4a6279eb42d 100644 --- a/docs/docs-beta/docs/integrations/libraries/openai/using-openai-with-dagster.md +++ b/docs/docs-beta/docs/integrations/libraries/openai/using-openai-with-dagster.md @@ -3,7 +3,7 @@ title: "OpenAI & Dagster" description: "The dagster-openai library provides the ability to build OpenAI pipelines with Dagster and log OpenAI API usage metadata in Dagster Insights." --- -::: +:::note This feature is considered **experimental** @@ -29,7 +29,6 @@ pip install dagster dagster-openai Note that you will need an OpenAI [API key](https://platform.openai.com/api-keys) to use the resource, which can be generated in your OpenAI account. 
- ## Connecting to OpenAI The first step in using OpenAI with Dagster is to tell Dagster how to connect to an OpenAI client using an OpenAI [resource](/guides/build/external-resources/). This resource contains the credentials needed to interact with OpenAI API. @@ -44,47 +43,13 @@ OPENAI_API_KEY=... Then, we can instruct Dagster to authorize the OpenAI resource using the environment variables: -```python startafter=start_example endbefore=end_example file=/integrations/openai/resource.py -from dagster_openai import OpenAIResource - -from dagster import EnvVar - -# Pull API key from environment variables -openai = OpenAIResource( - api_key=EnvVar("OPENAI_API_KEY"), -) -``` + ## Using the OpenAI resource with assets The OpenAI resource can be used in assets in order to interact with the OpenAI API. Note that in this example, we supply our credentials as environment variables directly when instantiating the object. -{/* TODO convert to */} -```python startafter=start_example endbefore=end_example file=/integrations/openai/assets.py -from dagster_openai import OpenAIResource - -from dagster import AssetExecutionContext, Definitions, EnvVar, asset, define_asset_job - - -@asset(compute_kind="OpenAI") -def openai_asset(context: AssetExecutionContext, openai: OpenAIResource): - with openai.get_client(context) as client: - client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": "Say this is a test."}], - ) - - -openai_asset_job = define_asset_job(name="openai_asset_job", selection="openai_asset") - -defs = Definitions( - assets=[openai_asset], - jobs=[openai_asset_job], - resources={ - "openai": OpenAIResource(api_key=EnvVar("OPENAI_API_KEY")), - }, -) -``` + After materializing your asset, your OpenAI API usage metadata will be available in the **Events** and **Plots** tabs of your asset in the Dagster UI. If you are using [Dagster+](/dagster-plus), your usage metadata will also be available in [Dagster Insights](/dagster-plus/features/insights). {/* Refer to the [Viewing and materializing assets in the UI guide](https://docs.dagster.io/guides/build/assets/defining-assets#viewing-and-materializing-assets-in-the-ui) for more information. 
*/} @@ -98,28 +63,4 @@ Currently, the OpenAI resource doesn't (out-of-the-box) log OpenAI usage metadat ::: -{/* TODO convert to */} -```python startafter=start_example endbefore=end_example file=/integrations/openai/ops.py -from dagster_openai import OpenAIResource - -from dagster import Definitions, EnvVar, GraphDefinition, OpExecutionContext, op - - -@op -def openai_op(context: OpExecutionContext, openai: OpenAIResource): - with openai.get_client(context) as client: - client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": "Say this is a test"}], - ) - - -openai_op_job = GraphDefinition(name="openai_op_job", node_defs=[openai_op]).to_job() - -defs = Definitions( - jobs=[openai_op_job], - resources={ - "openai": OpenAIResource(api_key=EnvVar("OPENAI_API_KEY")), - }, -) -``` + diff --git a/docs/docs-beta/docs/integrations/libraries/pandas/using-pandas-with-dagster.md b/docs/docs-beta/docs/integrations/libraries/pandas/using-pandas-with-dagster.md index 92fad91cd4a22..759f495d0d5e5 100644 --- a/docs/docs-beta/docs/integrations/libraries/pandas/using-pandas-with-dagster.md +++ b/docs/docs-beta/docs/integrations/libraries/pandas/using-pandas-with-dagster.md @@ -24,39 +24,12 @@ The `dagster_pandas` library provides the ability to perform data validation, em To create a custom `dagster_pandas` type, use `create_dagster_pandas_dataframe_type` and provide a list of `PandasColumn` objects which specify column-level schema and constraints. For example, we can construct a custom dataframe type to represent a set of e-bike trips in the following way: -{/* TODO convert to */} -```python file=/legacy/dagster_pandas_guide/core_trip.py startafter=start_core_trip_marker_0 endbefore=end_core_trip_marker_0 -TripDataFrame = create_dagster_pandas_dataframe_type( - name="TripDataFrame", - columns=[ - PandasColumn.integer_column("bike_id", min_value=0), - PandasColumn.categorical_column("color", categories={"red", "green", "blue"}), - PandasColumn.datetime_column( - "start_time", min_datetime=Timestamp(year=2020, month=2, day=10) - ), - PandasColumn.datetime_column( - "end_time", min_datetime=Timestamp(year=2020, month=2, day=10) - ), - PandasColumn.string_column("station"), - PandasColumn.exists("amount_paid"), - PandasColumn.boolean_column("was_member"), - ], -) -``` + + Once our custom data type is defined, we can use it as the type declaration for the inputs / outputs of our ops: -{/* TODO convert to */} -```python file=/legacy/dagster_pandas_guide/core_trip.py startafter=start_core_trip_marker_1 endbefore=end_core_trip_marker_1 -@op(out=Out(TripDataFrame)) -def load_trip_dataframe() -> DataFrame: - return read_csv( - file_relative_path(__file__, "./ebike_trips.csv"), - parse_dates=["start_time", "end_time"], - date_parser=lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S.%f"), - dtype={"color": "category"}, - ) -``` + By passing in these `PandasColumn` objects, we are expressing the schema and constraints we expect our dataframes to follow when Dagster performs type checks for our ops. 
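As a condensed sketch, with an illustrative column set and op names, a custom type can guard both an op's output and a downstream op's input:

```python
import pandas as pd
from dagster_pandas import PandasColumn, create_dagster_pandas_dataframe_type

from dagster import In, Out, op

# A pared-down custom dataframe type; the columns are illustrative.
SimpleTripDataFrame = create_dagster_pandas_dataframe_type(
    name="SimpleTripDataFrame",
    columns=[
        PandasColumn.integer_column("bike_id", min_value=0),
        PandasColumn.string_column("station"),
    ],
)


@op(out=Out(SimpleTripDataFrame))
def load_trips() -> pd.DataFrame:
    return pd.DataFrame({"bike_id": [1, 2], "station": ["A", "B"]})


@op(ins={"trips": In(SimpleTripDataFrame)})
def count_trips(trips) -> int:
    # The type check runs before this op body, so `trips` is already validated.
    return len(trips)
```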
Moreover, if we go to the op viewer, we can follow our schema documented in the UI: @@ -70,12 +43,7 @@ To do this, we provide a list of dataframe constraints to `create_dagster_pandas This looks like: -{/* TODO convert to */} -```python file=/legacy/dagster_pandas_guide/shape_constrained_trip.py startafter=start_create_type endbefore=end_create_type -ShapeConstrainedTripDataFrame = create_dagster_pandas_dataframe_type( - name="ShapeConstrainedTripDataFrame", dataframe_constraints=[RowCountConstraint(4)] -) -``` + If we rerun the above example with this dataframe, nothing should change. However, if we pass in 100 to the row count constraint, we can watch our job fail that type check. @@ -83,23 +51,7 @@ If we rerun the above example with this dataframe, nothing should change. Howeve Aside from constraint validation, `create_dagster_pandas_dataframe_type` also takes in a summary statistics function that emits metadata dictionaries which are surfaced during runs. Since data systems seldom control the quality of the data they receive, it becomes important to monitor data as it flows through your systems. In complex jobs, this can help debug and monitor data drift over time. Let's illustrate how this works in our example: -{/* TODO convert to */} -```python file=/legacy/dagster_pandas_guide/summary_stats.py startafter=start_summary endbefore=end_summary -def compute_trip_dataframe_summary_statistics(dataframe): - return { - "min_start_time": min(dataframe["start_time"]).strftime("%Y-%m-%d"), - "max_end_time": max(dataframe["end_time"]).strftime("%Y-%m-%d"), - "num_unique_bikes": str(dataframe["bike_id"].nunique()), - "n_rows": len(dataframe), - "columns": str(dataframe.columns), - } - - -SummaryStatsTripDataFrame = create_dagster_pandas_dataframe_type( - name="SummaryStatsTripDataFrame", - metadata_fn=compute_trip_dataframe_summary_statistics, -) -``` + Now if we run this job in the UI launchpad, we can see that the `SummaryStatsTripDataFrame` type is displayed in the logs along with the emitted metadata. @@ -111,36 +63,4 @@ Now if we run this job in the UI launchpad, we can see that the `SummaryStatsTri To tie this back to our example, let's say that we want to validate that the amount paid for a e-bike must be in 5 dollar increments because that is the price per mile rounded up. As a result, let's implement a `DivisibleByFiveConstraint`. To do this, all it needs is a `markdown_description` for the UI which accepts and renders markdown syntax, an `error_description` for error logs, and a validation method which throws a `ColumnConstraintViolationException` if a row fails validation. 
This would look like the following: -{/* TODO convert to */} -```python file=/legacy/dagster_pandas_guide/custom_column_constraint.py startafter=start_custom_col endbefore=end_custom_col -class DivisibleByFiveConstraint(ColumnConstraint): - def __init__(self): - message = "Value must be divisible by 5" - super().__init__(error_description=message, markdown_description=message) - - def validate(self, dataframe, column_name): - rows_with_unexpected_buckets = dataframe[ - dataframe[column_name].apply(lambda x: x % 5 != 0) - ] - if not rows_with_unexpected_buckets.empty: - raise ColumnConstraintViolationException( - constraint_name=self.name, - constraint_description=self.error_description, - column_name=column_name, - offending_rows=rows_with_unexpected_buckets, - ) - - -CustomTripDataFrame = create_dagster_pandas_dataframe_type( - name="CustomTripDataFrame", - columns=[ - PandasColumn( - "amount_paid", - constraints=[ - ColumnDTypeInSetConstraint({"int64"}), - DivisibleByFiveConstraint(), - ], - ) - ], -) -``` + diff --git a/docs/docs-beta/docs/integrations/libraries/pandera/using-pandera-with-dagster.md b/docs/docs-beta/docs/integrations/libraries/pandera/using-pandera-with-dagster.md index b71dbc58050c4..cbf99190b5d1b 100644 --- a/docs/docs-beta/docs/integrations/libraries/pandera/using-pandera-with-dagster.md +++ b/docs/docs-beta/docs/integrations/libraries/pandera/using-pandera-with-dagster.md @@ -32,47 +32,7 @@ To get started, you'll need: The `dagster-pandera` library exposes only a single public function, `pandera_schema_to_dagster_type`, which generates Dagster types from Pandera schemas. The Dagster type wraps the Pandera schema and invokes the schema's `validate()` method inside its type check function. -{/* TODO convert to */} -```python file=/integrations/pandera/example.py -import random - -import pandas as pd -import pandera as pa -from dagster_pandera import pandera_schema_to_dagster_type -from pandera.typing import Series - -from dagster import Out, job, op - -APPLE_STOCK_PRICES = { - "name": ["AAPL", "AAPL", "AAPL", "AAPL", "AAPL"], - "date": ["2018-01-22", "2018-01-23", "2018-01-24", "2018-01-25", "2018-01-26"], - "open": [177.3, 177.3, 177.25, 174.50, 172.0], - "close": [177.0, 177.04, 174.22, 171.11, 171.51], -} - - -class StockPrices(pa.DataFrameModel): - """Open/close prices for one or more stocks by day.""" - - name: Series[str] = pa.Field(description="Ticker symbol of stock") - date: Series[str] = pa.Field(description="Date of prices") - open: Series[float] = pa.Field(ge=0, description="Price at market open") - close: Series[float] = pa.Field(ge=0, description="Price at market close") - - -@op(out=Out(dagster_type=pandera_schema_to_dagster_type(StockPrices))) -def apple_stock_prices_dirty(): - prices = pd.DataFrame(APPLE_STOCK_PRICES) - i = random.choice(prices.index) - prices.loc[i, "open"] = pd.NA - prices.loc[i, "close"] = pd.NA - return prices - - -@job -def stocks_job(): - apple_stock_prices_dirty() -``` + In the above example, we defined a toy job (`stocks_job`) with a single asset, `apple_stock_prices_dirty`. This asset returns a pandas `DataFrame` containing the opening and closing prices of Apple stock (AAPL) for a random week. The `_dirty` suffix is included because we've corrupted the data with a few random nulls. 
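As a complementary sketch, the same helper can wrap a smaller, hypothetical schema and guard an op that returns clean data, so the generated type check passes:

```python
import pandas as pd
import pandera as pa
from dagster_pandera import pandera_schema_to_dagster_type
from pandera.typing import Series

from dagster import Out, op


class CleanPrices(pa.DataFrameModel):
    """Hypothetical schema used only for illustration."""

    name: Series[str] = pa.Field(description="Ticker symbol")
    close: Series[float] = pa.Field(ge=0, description="Closing price")


@op(out=Out(dagster_type=pandera_schema_to_dagster_type(CleanPrices)))
def clean_prices() -> pd.DataFrame:
    # Data that satisfies the schema passes the generated type check, which
    # invokes the schema's validate() method under the hood.
    return pd.DataFrame({"name": ["AAPL"], "close": [171.5]})
```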
diff --git a/docs/docs-beta/docs/integrations/libraries/powerbi/using-powerbi-with-dagster.md b/docs/docs-beta/docs/integrations/libraries/powerbi/using-powerbi-with-dagster.md index 5f7f9118b9429..1417bf33e9fa6 100644 --- a/docs/docs-beta/docs/integrations/libraries/powerbi/using-powerbi-with-dagster.md +++ b/docs/docs-beta/docs/integrations/libraries/powerbi/using-powerbi-with-dagster.md @@ -3,7 +3,7 @@ title: "Using Power BI with Dagster" description: Represent your Power BI assets in Dagster --- -::: +:::note This feature is considered **experimental** @@ -44,38 +44,7 @@ To load Power BI assets into the Dagster asset graph, you must first construct a Dagster can automatically load all semantic models, data sources, reports, and dashboards from your Power BI workspace as asset specs. Call the function, which returns a list of s representing your Power BI assets. You can then include these asset specs in your object: -{/* TODO convert to */} -```python file=/integrations/power-bi/representing-power-bi-assets.py -from dagster_powerbi import ( - PowerBIServicePrincipal, - PowerBIToken, - PowerBIWorkspace, - load_powerbi_asset_specs, -) - -import dagster as dg - -# Connect using a service principal -power_bi_workspace = PowerBIWorkspace( - credentials=PowerBIServicePrincipal( - client_id=dg.EnvVar("POWER_BI_CLIENT_ID"), - client_secret=dg.EnvVar("POWER_BI_CLIENT_SECRET"), - tenant_id=dg.EnvVar("POWER_BI_TENANT_ID"), - ), - workspace_id=dg.EnvVar("POWER_BI_WORKSPACE_ID"), -) - -# Alternatively, connect directly using an API access token -power_bi_workspace = PowerBIWorkspace( - credentials=PowerBIToken(api_token=dg.EnvVar("POWER_BI_API_TOKEN")), - workspace_id=dg.EnvVar("POWER_BI_WORKSPACE_ID"), -) - -power_bi_specs = load_powerbi_asset_specs(power_bi_workspace) -defs = dg.Definitions( - assets=[*power_bi_specs], resources={"power_bi": power_bi_workspace} -) -``` + By default, Dagster will attempt to snapshot your entire workspace using Power BI's [metadata scanner APIs](https://learn.microsoft.com/en-us/fabric/governance/metadata-scanning-overview), which are able to retrieve more detailed information about your Power BI assets, but rely on the workspace being configured to allow this access. @@ -85,53 +54,7 @@ If you encounter issues with the scanner APIs, you may disable them using `load_ By default, Dagster will generate asset specs for each Power BI asset based on its type, and populate default metadata. You can further customize asset properties by passing a custom subclass to the function. This subclass can implement methods to customize the asset specs for each Power BI asset type. 
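For instance, a minimal translator might only tag every Power BI asset with an owner and leave the default keys untouched. The class and team names below are illustrative:

```python
from dagster_powerbi import DagsterPowerBITranslator
from dagster_powerbi.translator import PowerBITranslatorData

import dagster as dg


class OwnerTaggingPowerBITranslator(DagsterPowerBITranslator):
    def get_asset_spec(self, data: PowerBITranslatorData) -> dg.AssetSpec:
        # Start from the default spec, then customize it
        default_spec = super().get_asset_spec(data)
        return default_spec.replace_attributes(owners=["team:my_team"])
```

An instance of the translator is then passed to `load_powerbi_asset_specs` via its `dagster_powerbi_translator` argument.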
-{/* TODO convert to */} -```python file=/integrations/power-bi/customize-power-bi-asset-defs.py -from dagster_powerbi import ( - DagsterPowerBITranslator, - PowerBIServicePrincipal, - PowerBIWorkspace, - load_powerbi_asset_specs, -) -from dagster_powerbi.translator import PowerBIContentType, PowerBITranslatorData - -import dagster as dg - -power_bi_workspace = PowerBIWorkspace( - credentials=PowerBIServicePrincipal( - client_id=dg.EnvVar("POWER_BI_CLIENT_ID"), - client_secret=dg.EnvVar("POWER_BI_CLIENT_SECRET"), - tenant_id=dg.EnvVar("POWER_BI_TENANT_ID"), - ), - workspace_id=dg.EnvVar("POWER_BI_WORKSPACE_ID"), -) - - -# A translator class lets us customize properties of the built -# Power BI assets, such as the owners or asset key -class MyCustomPowerBITranslator(DagsterPowerBITranslator): - def get_asset_spec(self, data: PowerBITranslatorData) -> dg.AssetSpec: - # We create the default asset spec using super() - default_spec = super().get_asset_spec(data) - # We customize the team owner tag for all assets, - # and we customize the asset key prefix only for dashboards. - return default_spec.replace_attributes( - key=( - default_spec.key.with_prefix("prefix") - if data.content_type == PowerBIContentType.DASHBOARD - else default_spec.key - ), - owners=["team:my_team"], - ) - - -power_bi_specs = load_powerbi_asset_specs( - power_bi_workspace, dagster_powerbi_translator=MyCustomPowerBITranslator() -) -defs = dg.Definitions( - assets=[*power_bi_specs], resources={"power_bi": power_bi_workspace} -) -``` + Note that `super()` is called in each of the overridden methods to generate the default asset spec. It is best practice to generate the default asset spec before customizing it. @@ -139,81 +62,13 @@ Note that `super()` is called in each of the overridden methods to generate the Definitions from multiple Power BI workspaces can be combined by instantiating multiple resources and merging their specs. This lets you view all your Power BI assets in a single asset graph: -{/* TODO convert to */} -```python file=/integrations/power-bi/multiple-power-bi-workspaces.py -from dagster_powerbi import ( - PowerBIServicePrincipal, - PowerBIWorkspace, - load_powerbi_asset_specs, -) - -import dagster as dg - -credentials = PowerBIServicePrincipal( - client_id=dg.EnvVar("POWER_BI_CLIENT_ID"), - client_secret=dg.EnvVar("POWER_BI_CLIENT_SECRET"), - tenant_id=dg.EnvVar("POWER_BI_TENANT_ID"), -) - -sales_team_workspace = PowerBIWorkspace( - credentials=credentials, - workspace_id="726c94ff-c408-4f43-8edf-61fbfa1753c7", -) - -marketing_team_workspace = PowerBIWorkspace( - credentials=credentials, - workspace_id="8b7f815d-4e64-40dd-993c-cfa4fb12edee", -) - -sales_team_specs = load_powerbi_asset_specs(sales_team_workspace) -marketing_team_specs = load_powerbi_asset_specs(marketing_team_workspace) - -# Merge the specs into a single set of definitions -defs = dg.Definitions( - assets=[*sales_team_specs, *marketing_team_specs], - resources={ - "marketing_power_bi": marketing_team_workspace, - "sales_power_bi": sales_team_workspace, - }, -) -``` + ## Materialize Power BI semantic models from Dagster Dagster's default behavior is to pull in representations of Power BI semantic models as external assets, which appear in the asset graph but can't be materialized. However, you can build executable asset definitions that trigger the refresh of Power BI semantic models. The utility will construct an asset definition that triggers a refresh of a semantic model when materialized. 
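A rough sketch of that pattern, mirroring the example this page references, with credentials pulled from environment variables:

```python
from dagster_powerbi import (
    PowerBIServicePrincipal,
    PowerBIWorkspace,
    build_semantic_model_refresh_asset_definition,
    load_powerbi_asset_specs,
)

import dagster as dg

power_bi_workspace = PowerBIWorkspace(
    credentials=PowerBIServicePrincipal(
        client_id=dg.EnvVar("POWER_BI_CLIENT_ID"),
        client_secret=dg.EnvVar("POWER_BI_CLIENT_SECRET"),
        tenant_id=dg.EnvVar("POWER_BI_TENANT_ID"),
    ),
    workspace_id=dg.EnvVar("POWER_BI_WORKSPACE_ID"),
)

# Swap each semantic model spec for an executable refresh definition;
# all other Power BI asset specs are passed through unchanged.
power_bi_assets = [
    build_semantic_model_refresh_asset_definition(resource_key="power_bi", spec=spec)
    if spec.tags.get("dagster-powerbi/asset_type") == "semantic_model"
    else spec
    for spec in load_powerbi_asset_specs(power_bi_workspace)
]

defs = dg.Definitions(
    assets=[*power_bi_assets], resources={"power_bi": power_bi_workspace}
)
```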
-{/* TODO convert to */} -```python file=/integrations/power-bi/materialize-semantic-models.py -from dagster_powerbi import ( - PowerBIServicePrincipal, - PowerBIWorkspace, - build_semantic_model_refresh_asset_definition, - load_powerbi_asset_specs, -) - -import dagster as dg - -power_bi_workspace = PowerBIWorkspace( - credentials=PowerBIServicePrincipal( - client_id=dg.EnvVar("POWER_BI_CLIENT_ID"), - client_secret=dg.EnvVar("POWER_BI_CLIENT_SECRET"), - tenant_id=dg.EnvVar("POWER_BI_TENANT_ID"), - ), - workspace_id=dg.EnvVar("POWER_BI_WORKSPACE_ID"), -) - -# Load Power BI asset specs, and use the asset definition builder to -# construct a semantic model refresh definition for each semantic model -power_bi_assets = [ - build_semantic_model_refresh_asset_definition(resource_key="power_bi", spec=spec) - if spec.tags.get("dagster-powerbi/asset_type") == "semantic_model" - else spec - for spec in load_powerbi_asset_specs(power_bi_workspace) -] -defs = dg.Definitions( - assets=[*power_bi_assets], resources={"power_bi": power_bi_workspace} -) -``` + You can then add these semantic models to jobs or as targets of Dagster sensors or schedules to trigger refreshes of the models on a cadence or based on other conditions. @@ -221,53 +76,4 @@ You can then add these semantic models to jobs or as targets of Dagster sensors Instead of using the out-of-the-box utility, you can build your own asset definitions that trigger the refresh of Power BI semantic models. This allows you to customize how the refresh is triggered or to run custom code before or after the refresh. -{/* TODO convert to */} -```python file=/integrations/power-bi/materialize-semantic-models-advanced.py -from dagster_powerbi import ( - PowerBIServicePrincipal, - PowerBIWorkspace, - build_semantic_model_refresh_asset_definition, - load_powerbi_asset_specs, -) - -import dagster as dg - -power_bi_workspace = PowerBIWorkspace( - credentials=PowerBIServicePrincipal( - client_id=dg.EnvVar("POWER_BI_CLIENT_ID"), - client_secret=dg.EnvVar("POWER_BI_CLIENT_SECRET"), - tenant_id=dg.EnvVar("POWER_BI_TENANT_ID"), - ), - workspace_id=dg.EnvVar("POWER_BI_WORKSPACE_ID"), -) - - -# Asset definition factory which triggers a semantic model refresh and sends a notification -# once complete -def build_semantic_model_refresh_and_notify_asset_def( - spec: dg.AssetSpec, -) -> dg.AssetsDefinition: - dataset_id = spec.metadata["dagster-powerbi/id"] - - @dg.multi_asset(specs=[spec], name=spec.key.to_python_identifier()) - def rebuild_semantic_model( - context: dg.AssetExecutionContext, power_bi: PowerBIWorkspace - ) -> None: - power_bi.trigger_and_poll_refresh(dataset_id) - # Do some custom work after refreshing here, such as sending an email notification - - return rebuild_semantic_model - - -# Load Power BI asset specs, and use our custom asset definition builder to -# construct a definition for each semantic model -power_bi_assets = [ - build_semantic_model_refresh_and_notify_asset_def(spec=spec) - if spec.tags.get("dagster-powerbi/asset_type") == "semantic_model" - else spec - for spec in load_powerbi_asset_specs(power_bi_workspace) -] -defs = dg.Definitions( - assets=[*power_bi_assets], resources={"power_bi": power_bi_workspace} -) -``` + diff --git a/docs/docs-beta/docs/integrations/libraries/sigma/using-sigma-with-dagster.md b/docs/docs-beta/docs/integrations/libraries/sigma/using-sigma-with-dagster.md index e782545bef8bd..959c952e9ad72 100644 --- a/docs/docs-beta/docs/integrations/libraries/sigma/using-sigma-with-dagster.md +++ 
b/docs/docs-beta/docs/integrations/libraries/sigma/using-sigma-with-dagster.md @@ -3,7 +3,7 @@ title: "Using Sigma with Dagster" description: Represent your Sigma assets in Dagster --- -::: +:::note This feature is considered **experimental** @@ -42,21 +42,7 @@ To load Sigma assets into the Dagster asset graph, you must first construct a

      function, which returns list of s representing your Sigma assets. You can then include these asset specs in your object: -{/* TODO convert to */} -```python file=/integrations/sigma/representing-sigma-assets.py -from dagster_sigma import SigmaBaseUrl, SigmaOrganization, load_sigma_asset_specs - -import dagster as dg - -sigma_organization = SigmaOrganization( - base_url=SigmaBaseUrl.AWS_US, - client_id=dg.EnvVar("SIGMA_CLIENT_ID"), - client_secret=dg.EnvVar("SIGMA_CLIENT_SECRET"), -) - -sigma_specs = load_sigma_asset_specs(sigma_organization) -defs = dg.Definitions(assets=[*sigma_specs], resources={"sigma": sigma_organization}) -``` + ## Load Sigma assets from filtered workbooks @@ -64,76 +50,13 @@ It is possible to load a subset of your Sigma assets by providing a */} -```python file=/integrations/sigma/filtering-sigma-assets.py -from dagster_sigma import ( - SigmaBaseUrl, - SigmaFilter, - SigmaOrganization, - load_sigma_asset_specs, -) - -import dagster as dg - -sigma_organization = SigmaOrganization( - base_url=SigmaBaseUrl.AWS_US, - client_id=dg.EnvVar("SIGMA_CLIENT_ID"), - client_secret=dg.EnvVar("SIGMA_CLIENT_SECRET"), -) - -sigma_specs = load_sigma_asset_specs( - organization=sigma_organization, - sigma_filter=SigmaFilter( - # Filter down to only the workbooks in these folders - workbook_folders=[ - ("my_folder", "my_subfolder"), - ("my_folder", "my_other_subfolder"), - ], - # Specify whether to include datasets that are not used in any workbooks - # default is True - include_unused_datasets=False, - ), -) -defs = dg.Definitions(assets=[*sigma_specs], resources={"sigma": sigma_organization}) -``` + ### Customize asset definition metadata for Sigma assets By default, Dagster will generate asset specs for each Sigma asset based on its type, and populate default metadata. You can further customize asset properties by passing a custom subclass to the function. This subclass can implement methods to customize the asset specs for each Sigma asset type. -{/* TODO convert to */} -```python file=/integrations/sigma/customize-sigma-asset-defs.py -from dagster_sigma import ( - DagsterSigmaTranslator, - SigmaBaseUrl, - SigmaOrganization, - SigmaWorkbookTranslatorData, - load_sigma_asset_specs, -) - -import dagster as dg - -sigma_organization = SigmaOrganization( - base_url=SigmaBaseUrl.AWS_US, - client_id=dg.EnvVar("SIGMA_CLIENT_ID"), - client_secret=dg.EnvVar("SIGMA_CLIENT_SECRET"), -) - - -# A translator class lets us customize properties of the built Sigma assets, such as the owners or asset key -class MyCustomSigmaTranslator(DagsterSigmaTranslator): - def get_asset_spec(self, data: SigmaWorkbookTranslatorData) -> dg.AssetSpec: - # We create the default asset spec using super() - default_spec = super().get_asset_spec(data) - # we customize the team owner tag for all Sigma assets - return default_spec.replace_attributes(owners=["team:my_team"]) - - -sigma_specs = load_sigma_asset_specs( - sigma_organization, dagster_sigma_translator=MyCustomSigmaTranslator() -) -defs = dg.Definitions(assets=[*sigma_specs], resources={"sigma": sigma_organization}) -``` + Note that `super()` is called in each of the overridden methods to generate the default asset spec. It is best practice to generate the default asset spec before customizing it. @@ -141,36 +64,7 @@ Note that `super()` is called in each of the overridden methods to generate the Definitions from multiple Sigma organizations can be combined by instantiating multiple resources and merging their specs. 
This lets you view all your Sigma assets in a single asset graph: -{/* TODO convert to */} -```python file=/integrations/sigma/multiple-sigma-organizations.py -from dagster_sigma import SigmaBaseUrl, SigmaOrganization, load_sigma_asset_specs - -import dagster as dg - -sales_team_organization = SigmaOrganization( - base_url=SigmaBaseUrl.AWS_US, - client_id=dg.EnvVar("SALES_SIGMA_CLIENT_ID"), - client_secret=dg.EnvVar("SALES_SIGMA_CLIENT_SECRET"), -) - -marketing_team_organization = SigmaOrganization( - base_url=SigmaBaseUrl.AWS_US, - client_id=dg.EnvVar("MARKETING_SIGMA_CLIENT_ID"), - client_secret=dg.EnvVar("MARKETING_SIGMA_CLIENT_SECRET"), -) - -sales_team_specs = load_sigma_asset_specs(sales_team_organization) -marketing_team_specs = load_sigma_asset_specs(marketing_team_organization) - -# Merge the specs into a single set of definitions -defs = dg.Definitions( - assets=[*sales_team_specs, *marketing_team_specs], - resources={ - "marketing_sigma": marketing_team_organization, - "sales_sigma": sales_team_organization, - }, -) -``` + ### Related diff --git a/docs/docs-beta/docs/integrations/libraries/snowflake/reference.md b/docs/docs-beta/docs/integrations/libraries/snowflake/reference.md index 3286ed0f32d71..557598528ef31 100644 --- a/docs/docs-beta/docs/integrations/libraries/snowflake/reference.md +++ b/docs/docs-beta/docs/integrations/libraries/snowflake/reference.md @@ -17,94 +17,22 @@ Currently, the Dagster's Snowflake integration only supports encrypted private k **Directly to the resource** -{/* TODO convert to */} -```python file=/integrations/snowflake/private_key_auth_resource.py startafter=start_direct_key endbefore=end_direct_key -from dagster_snowflake import SnowflakeResource - -from dagster import Definitions, EnvVar - -defs = Definitions( - assets=[iris_dataset], - resources={ - "snowflake": SnowflakeResource( - account="abc1234.us-east-1", - user=EnvVar("SNOWFLAKE_USER"), - private_key=EnvVar("SNOWFLAKE_PK"), - private_key_password=EnvVar("SNOWFLAKE_PK_PASSWORD"), - database="FLOWERS", - ) - }, -) -``` + **Via a file** -{/* TODO convert to */} -```python file=/integrations/snowflake/private_key_auth_resource.py startafter=start_key_file endbefore=end_key_file -from dagster_snowflake import SnowflakeResource - -from dagster import Definitions, EnvVar - -defs = Definitions( - assets=[iris_dataset], - resources={ - "snowflake": SnowflakeResource( - account="abc1234.us-east-1", - user=EnvVar("SNOWFLAKE_USER"), - private_key_path="/path/to/private/key/file.p8", - private_key_password=EnvVar("SNOWFLAKE_PK_PASSWORD"), - database="FLOWERS", - ) - }, -) -``` + **Directly to the I/O manager** -{/* TODO convert to */} -```python file=/integrations/snowflake/private_key_auth_io_manager.py startafter=start_direct_key endbefore=end_direct_key -from dagster_snowflake_pandas import SnowflakePandasIOManager - -from dagster import Definitions, EnvVar - -defs = Definitions( - assets=[iris_dataset], - resources={ - "io_manager": SnowflakePandasIOManager( - account="abc1234.us-east-1", - user=EnvVar("SNOWFLAKE_USER"), - private_key=EnvVar("SNOWFLAKE_PK"), - private_key_password=EnvVar("SNOWFLAKE_PK_PASSWORD"), - database="FLOWERS", - ) - }, -) -``` + **Via a file** -{/* TODO convert to */} -```python file=/integrations/snowflake/private_key_auth_io_manager.py startafter=start_key_file endbefore=end_key_file -from dagster_snowflake_pandas import SnowflakePandasIOManager - -from dagster import Definitions, EnvVar - -defs = Definitions( - assets=[iris_dataset], - resources={ - "io_manager": 
SnowflakePandasIOManager( - account="abc1234.us-east-1", - user=EnvVar("SNOWFLAKE_USER"), - private_key_path="/path/to/private/key/file.p8", - private_key_password=EnvVar("SNOWFLAKE_PK_PASSWORD"), - database="FLOWERS", - ) - }, -) -``` + @@ -115,43 +43,7 @@ defs = Definitions( Using a [Snowflake resource](/api/python-api/libraries/dagster-snowflake#resource), you can execute custom SQL queries on a Snowflake database: -{/* TODO convert to */} -```python file=/integrations/snowflake/resource.py startafter=start endbefore=end -from dagster_snowflake import SnowflakeResource - -from dagster import Definitions, EnvVar, asset - -# this example executes a query against the IRIS_DATASET table created in Step 2 of the -# Using Dagster with Snowflake tutorial - - -@asset -def small_petals(snowflake: SnowflakeResource): - query = """ - create or replace table iris.small_petals as ( - SELECT * - FROM iris.iris_dataset - WHERE species = 'petal_length_cm' < 1 AND 'petal_width_cm' < 1 - ); - """ - - with snowflake.get_connection() as conn: - conn.cursor.execute(query) - - -defs = Definitions( - assets=[small_petals], - resources={ - "snowflake": SnowflakeResource( - account="abc1234.us-east-1", - user=EnvVar("SNOWFLAKE_USER"), - password=EnvVar("SNOWFLAKE_PASSWORD"), - database="FLOWERS", - schema="IRIS", - ) - }, -) -``` + Let's review what's happening in this example: @@ -161,36 +53,13 @@ Let's review what's happening in this example: For more information on the Snowflake resource, including additional configuration settings, see the API docs. - ## Using the Snowflake I/O manager ### Selecting specific columns in a downstream asset Sometimes you may not want to fetch an entire table as the input to a downstream asset. With the Snowflake I/O manager, you can select specific columns to load by supplying metadata on the downstream asset. -{/* TODO convert to */} -```python file=/integrations/snowflake/downstream_columns.py -import pandas as pd - -from dagster import AssetIn, asset - -# this example uses the iris_dataset asset from Step 2 of the Using Dagster with Snowflake tutorial - - -@asset( - ins={ - "iris_sepal": AssetIn( - key="iris_dataset", - metadata={"columns": ["sepal_length_cm", "sepal_width_cm"]}, - ) - } -) -def sepal_data(iris_sepal: pd.DataFrame) -> pd.DataFrame: - iris_sepal["sepal_area_cm2"] = ( - iris_sepal["sepal_length_cm"] * iris_sepal["sepal_width_cm"] - ) - return iris_sepal -``` + In this example, we only use the columns containing sepal data from the `IRIS_DATASET` table created in [Step 2](using-snowflake-with-dagster-io-managers#store-a-dagster-asset-as-a-table-in-snowflake) of the [Snowflake I/O manager tutorial](using-snowflake-with-dagster-io-managers). Fetching the entire table would be unnecessarily costly, so to select specific columns, we can add metadata to the input asset. We do this in the `metadata` parameter of the `AssetIn` that loads the `iris_dataset` asset in the `ins` parameter. We supply the key `columns` with a list of names of the columns we want to fetch. @@ -205,40 +74,7 @@ The Snowflake I/O manager supports storing and loading partitioned data. 
In orde To store statically-partitioned assets in Snowflake, specify `partition_expr` metadata on the asset to tell the Snowflake I/O manager which column contains the partition data: -{/* TODO convert to CodeExample */} -```python file=/integrations/snowflake/static_partition.py -import pandas as pd - -from dagster import AssetExecutionContext, StaticPartitionsDefinition, asset - - -@asset( - partitions_def=StaticPartitionsDefinition( - ["Iris-setosa", "Iris-virginica", "Iris-versicolor"] - ), - metadata={"partition_expr": "SPECIES"}, -) -def iris_dataset_partitioned(context: AssetExecutionContext) -> pd.DataFrame: - species = context.partition_key - - full_df = pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) - - return full_df[full_df["Species"] == species] - - -@asset -def iris_cleaned(iris_dataset_partitioned: pd.DataFrame): - return iris_dataset_partitioned.dropna().drop_duplicates() -``` + Dagster uses the `partition_expr` metadata to craft the `SELECT` statement when loading the partition in the downstream asset. When loading a static partition (or multiple static partitions), the following statement is used: @@ -261,30 +97,7 @@ SELECT * Like statically-partitioned assets, you can specify `partition_expr` metadata on the asset to tell the Snowflake I/O manager which column contains the partition data: -{/* TODO convert to CodeExample */} -```python file=/integrations/snowflake/time_partition.py startafter=start_example endbefore=end_example -import pandas as pd - -from dagster import AssetExecutionContext, DailyPartitionsDefinition, asset - - -@asset( - partitions_def=DailyPartitionsDefinition(start_date="2023-01-01"), - metadata={"partition_expr": "TO_TIMESTAMP(TIME::INT)"}, -) -def iris_data_per_day(context: AssetExecutionContext) -> pd.DataFrame: - partition = context.partition_key - - # get_iris_data_for_date fetches all of the iris data for a given date, - # the returned dataframe contains a column named 'time' with that stores - # the time of the row as an integer of seconds since epoch - return get_iris_data_for_date(partition) - - -@asset -def iris_cleaned(iris_data_per_day: pd.DataFrame): - return iris_data_per_day.dropna().drop_duplicates() -``` + Dagster uses the `partition_expr` metadata to craft the `SELECT` statement when loading the correct partition in the downstream asset. When loading a dynamic partition, the following statement is used: @@ -311,50 +124,7 @@ In this example, the data in the `TIME` column are integers, so the `partition_e The Snowflake I/O manager can also store data partitioned on multiple dimensions. 
To do this, you must specify the column for each partition as a dictionary of `partition_expr` metadata: -{/* TODO convert to CodeExample */} -```python file=/integrations/snowflake/multi_partition.py startafter=start_example endbefore=end_example -import pandas as pd - -from dagster import ( - AssetExecutionContext, - DailyPartitionsDefinition, - MultiPartitionKey, - MultiPartitionsDefinition, - StaticPartitionsDefinition, - asset, -) - - -@asset( - partitions_def=MultiPartitionsDefinition( - { - "date": DailyPartitionsDefinition(start_date="2023-01-01"), - "species": StaticPartitionsDefinition( - ["Iris-setosa", "Iris-virginica", "Iris-versicolor"] - ), - } - ), - metadata={ - "partition_expr": {"date": "TO_TIMESTAMP(TIME::INT)", "species": "SPECIES"} - }, -) -def iris_dataset_partitioned(context: AssetExecutionContext) -> pd.DataFrame: - partition = context.partition_key.keys_by_dimension - species = partition["species"] - date = partition["date"] - - # get_iris_data_for_date fetches all of the iris data for a given date, - # the returned dataframe contains a column named 'time' with that stores - # the time of the row as an integer of seconds since epoch - full_df = get_iris_data_for_date(date) - - return full_df[full_df["species"] == species] - - -@asset -def iris_cleaned(iris_dataset_partitioned: pd.DataFrame): - return iris_dataset_partitioned.dropna().drop_duplicates() -``` + Dagster uses the `partition_expr` metadata to craft the `SELECT` statement when loading the correct partition in a downstream asset. For multi-partitions, Dagster concatenates the `WHERE` statements described in the above sections to craft the correct `SELECT` statement. @@ -378,45 +148,13 @@ You can specify the default schema where data will be stored as configuration to To store assets in different schemas, specify the schema as metadata: -{/* TODO convert to */} -```python file=/integrations/snowflake/schema.py startafter=start_metadata endbefore=end_metadata dedent=4 -daffodil_dataset = AssetSpec( - key=["daffodil_dataset"], metadata={"schema": "daffodil"} -) - -@asset(metadata={"schema": "iris"}) -def iris_dataset() -> pd.DataFrame: - return pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) -``` +{/* TODO add dedent=4 prop to CodeExample below */} + You can also specify the schema as part of the asset's asset key: -{/* TODO convert to */} -```python file=/integrations/snowflake/schema.py startafter=start_asset_key endbefore=end_asset_key dedent=4 -daffodil_dataset = AssetSpec(key=["daffodil", "daffodil_dataset"]) - -@asset(key_prefix=["iris"]) -def iris_dataset() -> pd.DataFrame: - return pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) -``` +{/* TODO add dedent=4 prop to CodeExample below */} + In this example, the `iris_dataset` asset will be stored in the `IRIS` schema, and the `daffodil_dataset` asset will be found in the `DAFFODIL` schema. @@ -452,50 +190,7 @@ Prior to `dagster-snowflake` version `0.19.0` the Snowflake I/O manager converte You may have assets that you don't want to store in Snowflake. 
You can provide an I/O manager to each asset using the `io_manager_key` parameter in the `asset` decorator: -{/* TODO convert to */} -```python file=/integrations/snowflake/multiple_io_managers.py startafter=start_example endbefore=end_example -import pandas as pd -from dagster_aws.s3.io_manager import s3_pickle_io_manager -from dagster_snowflake_pandas import SnowflakePandasIOManager - -from dagster import Definitions, EnvVar, asset - - -@asset(io_manager_key="warehouse_io_manager") -def iris_dataset() -> pd.DataFrame: - return pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) - - -@asset(io_manager_key="blob_io_manager") -def iris_plots(iris_dataset): - # plot_data is a function we've defined somewhere else - # that plots the data in a DataFrame - return plot_data(iris_dataset) - - -defs = Definitions( - assets=[iris_dataset, iris_plots], - resources={ - "warehouse_io_manager": SnowflakePandasIOManager( - database="FLOWERS", - schema="IRIS", - account="abc1234.us-east-1", - user=EnvVar("SNOWFLAKE_USER"), - password=EnvVar("SNOWFLAKE_PASSWORD"), - ), - "blob_io_manager": s3_pickle_io_manager, - }, -) -``` + In this example, the `iris_dataset` asset uses the I/O manager bound to the key `warehouse_io_manager` and `iris_plots` will use the I/O manager bound to the key `blob_io_manager`. In the object, we supply the I/O managers for those keys. When the assets are materialized, the `iris_dataset` will be stored in Snowflake, and `iris_plots` will be saved in Amazon S3. @@ -509,27 +204,7 @@ pip install dagster-snowflake-pyspark Then you can use the `SnowflakePySparkIOManager` in your `Definitions` as in [Step 1](using-snowflake-with-dagster-io-managers#step-1-configure-the-snowflake-io-manager) of the [Snowflake I/O manager tutorial](using-snowflake-with-dagster-io-managers). 
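As a quick reference while the inline snippet moves to `docs_snippets`, a sketch of that PySpark I/O manager configuration might look like this; the `iris_dataset` body is a placeholder and the account/database values are the tutorial's examples:

```python
from dagster_snowflake_pyspark import SnowflakePySparkIOManager

from dagster import Definitions, EnvVar, asset


@asset
def iris_dataset():
    # Placeholder body for illustration; the tutorial's asset returns a
    # PySpark DataFrame loaded from the Iris CSV.
    ...


defs = Definitions(
    assets=[iris_dataset],
    resources={
        "io_manager": SnowflakePySparkIOManager(
            account="abc1234.us-east-1",  # required
            user=EnvVar("SNOWFLAKE_USER"),  # required
            password=EnvVar("SNOWFLAKE_PASSWORD"),  # password or private key required
            database="FLOWERS",  # required
            warehouse="PLANTS",  # required for PySpark
            schema="IRIS",  # optional, defaults to PUBLIC
        )
    },
)
```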
-{/* TODO convert to */} -```python file=/integrations/snowflake/pyspark_configuration.py startafter=start_configuration endbefore=end_configuration -from dagster_snowflake_pyspark import SnowflakePySparkIOManager - -from dagster import Definitions, EnvVar - -defs = Definitions( - assets=[iris_dataset], - resources={ - "io_manager": SnowflakePySparkIOManager( - account="abc1234.us-east-1", # required - user=EnvVar("SNOWFLAKE_USER"), # required - password=EnvVar("SNOWFLAKE_PASSWORD"), # password or private key required - database="FLOWERS", # required - warehouse="PLANTS", # required for PySpark - role="writer", # optional, defaults to the default role for the account - schema="IRIS", # optional, defaults to PUBLIC - ) - }, -) -``` + :::note @@ -542,109 +217,12 @@ The `SnowflakePySparkIOManager` requires that a `SparkSession` be active and con -{/* TODO convert to CodeExample */} -```python file=/integrations/snowflake/pyspark_with_spark_resource.py -from dagster_pyspark import pyspark_resource -from dagster_snowflake_pyspark import SnowflakePySparkIOManager -from pyspark import SparkFiles -from pyspark.sql import DataFrame -from pyspark.sql.types import DoubleType, StringType, StructField, StructType - -from dagster import AssetExecutionContext, Definitions, EnvVar, asset - -SNOWFLAKE_JARS = "net.snowflake:snowflake-jdbc:3.8.0,net.snowflake:spark-snowflake_2.12:2.8.2-spark_3.0" - - -@asset(required_resource_keys={"pyspark"}) -def iris_dataset(context: AssetExecutionContext) -> DataFrame: - spark = context.resources.pyspark.spark_session - - schema = StructType( - [ - StructField("sepal_length_cm", DoubleType()), - StructField("sepal_width_cm", DoubleType()), - StructField("petal_length_cm", DoubleType()), - StructField("petal_width_cm", DoubleType()), - StructField("species", StringType()), - ] - ) - - url = "https://docs.dagster.io/assets/iris.csv" - spark.sparkContext.addFile(url) - - return spark.read.schema(schema).csv("file://" + SparkFiles.get("iris.csv")) - - -defs = Definitions( - assets=[iris_dataset], - resources={ - "io_manager": SnowflakePySparkIOManager( - account="abc1234.us-east-1", - user=EnvVar("SNOWFLAKE_USER"), - password=EnvVar("SNOWFLAKE_PASSWORD"), - database="FLOWERS", - warehouse="PLANTS", - schema="IRIS", - ), - "pyspark": pyspark_resource.configured( - {"spark_conf": {"spark.jars.packages": SNOWFLAKE_JARS}} - ), - }, -) -``` + -{/* TODO convert to CodeExample */} -```python file=/integrations/snowflake/pyspark_with_spark_session.py -from dagster_snowflake_pyspark import SnowflakePySparkIOManager -from pyspark import SparkFiles -from pyspark.sql import DataFrame, SparkSession -from pyspark.sql.types import DoubleType, StringType, StructField, StructType - -from dagster import Definitions, EnvVar, asset - -SNOWFLAKE_JARS = "net.snowflake:snowflake-jdbc:3.8.0,net.snowflake:spark-snowflake_2.12:2.8.2-spark_3.0" - - -@asset -def iris_dataset() -> DataFrame: - spark = SparkSession.builder.config( - key="spark.jars.packages", - value=SNOWFLAKE_JARS, - ).getOrCreate() - - schema = StructType( - [ - StructField("sepal_length_cm", DoubleType()), - StructField("sepal_width_cm", DoubleType()), - StructField("petal_length_cm", DoubleType()), - StructField("petal_width_cm", DoubleType()), - StructField("species", StringType()), - ] - ) - - url = ("https://docs.dagster.io/assets/iris.csv",) - spark.sparkContext.addFile(url) - - return spark.read.schema(schema).csv("file://" + SparkFiles.get("iris.csv")) - - -defs = Definitions( - assets=[iris_dataset], - resources={ - 
"io_manager": SnowflakePySparkIOManager( - account="abc1234.us-east-1", - user=EnvVar("SNOWFLAKE_USER"), - password=EnvVar("SNOWFLAKE_PASSWORD"), - database="FLOWERS", - warehouse="PLANTS", - schema="IRIS", - ), - }, -) -``` + @@ -653,48 +231,4 @@ defs = Definitions( If you work with both Pandas and PySpark DataFrames and want a single I/O manager to handle storing and loading these DataFrames in Snowflake, you can write a new I/O manager that handles both types. To do this, inherit from the base class and implement the `type_handlers` and `default_load_type` methods. The resulting I/O manager will inherit the configuration fields of the base `SnowflakeIOManager`. -{/* TODO convert to */} -```python file=/integrations/snowflake/pandas_and_pyspark.py startafter=start_example endbefore=end_example -from typing import Optional, Type - -import pandas as pd -from dagster_snowflake import SnowflakeIOManager -from dagster_snowflake_pandas import SnowflakePandasTypeHandler -from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler - -from dagster import Definitions, EnvVar - - -class SnowflakePandasPySparkIOManager(SnowflakeIOManager): - @staticmethod - def type_handlers(): - """type_handlers should return a list of the TypeHandlers that the I/O manager can use. - Here we return the SnowflakePandasTypeHandler and SnowflakePySparkTypeHandler so that the I/O - manager can store Pandas DataFrames and PySpark DataFrames. - """ - return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()] - - @staticmethod - def default_load_type() -> Optional[type]: - """If an asset is not annotated with an return type, default_load_type will be used to - determine which TypeHandler to use to store and load the output. - In this case, unannotated assets will be stored and loaded as Pandas DataFrames. - """ - return pd.DataFrame - - -defs = Definitions( - assets=[iris_dataset, rose_dataset], - resources={ - "io_manager": SnowflakePandasPySparkIOManager( - account="abc1234.us-east-1", - user=EnvVar("SNOWFLAKE_USER"), - password=EnvVar("SNOWFLAKE_PASSWORD"), - database="FLOWERS", - role="writer", - warehouse="PLANTS", - schema="IRIS", - ) - }, -) -``` + diff --git a/docs/docs-beta/docs/integrations/libraries/snowflake/using-snowflake-with-dagster-io-managers.md b/docs/docs-beta/docs/integrations/libraries/snowflake/using-snowflake-with-dagster-io-managers.md index 2f7bd07816fbf..43a687fd38884 100644 --- a/docs/docs-beta/docs/integrations/libraries/snowflake/using-snowflake-with-dagster-io-managers.md +++ b/docs/docs-beta/docs/integrations/libraries/snowflake/using-snowflake-with-dagster-io-managers.md @@ -47,34 +47,13 @@ To complete this tutorial, you'll need: For more information on authenticating with a private key, see [Authenticating with a private key](reference#authenticating-using-a-private-key) in the Snowflake reference guide. - ## Step 1: Configure the Snowflake I/O manager The Snowflake I/O manager requires some configuration to connect to your Snowflake instance. The `account`, `user` are required to connect with Snowflake. One method of authentication is required. You can use a password or a private key. Additionally, you need to specify a `database` to where all the tables should be stored. You can also provide some optional configuration to further customize the Snowflake I/O manager. You can specify a `warehouse` and `schema` where data should be stored, and a `role` for the I/O manager. 
-{/* TODO convert to */} -```python file=/integrations/snowflake/io_manager_tutorial/configuration.py startafter=start_example endbefore=end_example -from dagster_snowflake_pandas import SnowflakePandasIOManager - -from dagster import Definitions, EnvVar - -defs = Definitions( - assets=[iris_dataset], - resources={ - "io_manager": SnowflakePandasIOManager( - account="abc1234.us-east-1", # required - user=EnvVar("SNOWFLAKE_USER"), # required - password=EnvVar("SNOWFLAKE_PASSWORD"), # password or private key required - database="FLOWERS", # required - role="writer", # optional, defaults to the default role for the account - warehouse="PLANTS", # optional, defaults to default warehouse for the account - schema="IRIS", # optional, defaults to PUBLIC - ) - }, -) -``` + With this configuration, if you materialized an asset called `iris_dataset`, the Snowflake I/O manager would be permissioned with the role `writer` and would store the data in the `FLOWERS.IRIS.IRIS_DATASET` table in the `PLANTS` warehouse. @@ -82,8 +61,6 @@ Finally, in the API documentation. - - ## Step 2: Create tables in Snowflake The Snowflake I/O manager can create and update tables for your Dagster defined assets, but you can also make existing Snowflake tables available to Dagster. @@ -96,26 +73,7 @@ The Snowflake I/O manager can create and update tables for your Dagster defined To store data in Snowflake using the Snowflake I/O manager, the definitions of your assets don't need to change. You can tell Dagster to use the Snowflake I/O manager, like in [Step 1: Configure the Snowflake I/O manager](#step-1-configure-the-snowflake-io-manager), and Dagster will handle storing and loading your assets in Snowflake. -{/* TODO convert to */} -```python file=/integrations/snowflake/io_manager_tutorial/create_table.py -import pandas as pd - -from dagster import asset - - -@asset -def iris_dataset() -> pd.DataFrame: - return pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) -``` + In this example, we first define our [asset](/guides/build/assets/defining-assets). Here, we are fetching the Iris dataset as a Pandas DataFrame and renaming the columns. The type signature of the function tells the I/O manager what data type it is working with, so it is important to include the return type `pd.DataFrame`. @@ -127,12 +85,7 @@ When Dagster materializes the `iris_dataset` asset using the configuration from You may already have tables in Snowflake that you want to make available to other Dagster assets. You can define [external assets](/guides/build/assets/external-assets) for these tables. By defining an external asset for the existing table, you tell Dagster how to find the table so it can be fetched for downstream assets. -{/* TODO convert to */} -```python file=/integrations/snowflake/source_asset.py -from dagster import AssetSpec - -iris_harvest_data = AssetSpec(key="iris_harvest_data") -``` + In this example, we create a for a pre-existing table - perhaps created by an external data ingestion tool - that contains data about iris harvests. To make the data available to other Dagster assets, we need to tell the Snowflake I/O manager how to find the data. @@ -145,19 +98,7 @@ Since we supply the database and the schema in the I/O manager configuration in Once you have created an asset that represents a table in Snowflake, you will likely want to create additional assets that work with the data. 
Dagster and the Snowflake I/O manager allow you to load the data stored in Snowflake tables into downstream assets. -{/* TODO convert to */} -```python file=/integrations/snowflake/io_manager_tutorial/downstream.py startafter=start_example endbefore=end_example -import pandas as pd - -from dagster import asset - -# this example uses the iris_dataset asset from Step 2 - - -@asset -def iris_cleaned(iris_dataset: pd.DataFrame) -> pd.DataFrame: - return iris_dataset.dropna().drop_duplicates() -``` + In this example, we want to provide the `iris_dataset` asset from the [Store a Dagster asset as a table in Snowflake](#store-a-dagster-asset-as-a-table-in-snowflake) example to the `iris_cleaned` asset. In `iris_cleaned`, the `iris_dataset` parameter tells Dagster that the value for the `iris_dataset` asset should be provided as input to `iris_cleaned`. @@ -167,47 +108,4 @@ When materializing these assets, Dagster will use the `SnowflakePandasIOManager` When finished, your code should look like the following: -{/* TODO convert to */} -```python file=/integrations/snowflake/io_manager_tutorial/full_example.py -import pandas as pd -from dagster_snowflake_pandas import SnowflakePandasIOManager - -from dagster import AssetSpec, Definitions, EnvVar, asset - -iris_harvest_data = AssetSpec(key="iris_harvest_data") - - -@asset -def iris_dataset() -> pd.DataFrame: - return pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "sepal_width_cm", - "petal_length_cm", - "petal_width_cm", - "species", - ], - ) - - -@asset -def iris_cleaned(iris_dataset: pd.DataFrame) -> pd.DataFrame: - return iris_dataset.dropna().drop_duplicates() - - -defs = Definitions( - assets=[iris_dataset, iris_harvest_data, iris_cleaned], - resources={ - "io_manager": SnowflakePandasIOManager( - account="abc1234.us-east-1", - user=EnvVar("SNOWFLAKE_USER"), - password=EnvVar("SNOWFLAKE_PASSWORD"), - database="FLOWERS", - role="writer", - warehouse="PLANTS", - schema="IRIS", - ) - }, -) -``` + diff --git a/docs/docs-beta/docs/integrations/libraries/snowflake/using-snowflake-with-dagster.md b/docs/docs-beta/docs/integrations/libraries/snowflake/using-snowflake-with-dagster.md index c48d2334ab58b..52badcf7bbf33 100644 --- a/docs/docs-beta/docs/integrations/libraries/snowflake/using-snowflake-with-dagster.md +++ b/docs/docs-beta/docs/integrations/libraries/snowflake/using-snowflake-with-dagster.md @@ -53,22 +53,7 @@ To connect to Snowflake, we'll use the `dagster-snowflake` With this configuration, if you materialized an asset named `iris_dataset`, would use the role `WRITER` and store the data in the `FLOWERS.IRIS.IRIS_DATASET` table using the `PLANTS` warehouse. 
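A sketch of the resource configuration the surrounding prose describes; the `role` and `warehouse` fields are assumed to be accepted by `SnowflakeResource`, matching the I/O manager configuration shown elsewhere in this diff:

```python
from dagster_snowflake import SnowflakeResource

from dagster import Definitions, EnvVar

defs = Definitions(
    assets=[],  # the tutorial's assets are added in the steps that follow
    resources={
        "snowflake": SnowflakeResource(
            account="abc1234.us-east-1",
            user=EnvVar("SNOWFLAKE_USER"),
            password=EnvVar("SNOWFLAKE_PASSWORD"),
            role="writer",  # assumed field, as with the I/O managers
            warehouse="PLANTS",  # assumed field, as with the I/O managers
            database="FLOWERS",
            schema="IRIS",
        )
    },
)
```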
@@ -81,44 +66,7 @@ For more info about each of the configuration values, refer to the */} -```python file=/integrations/snowflake/resource_tutorial/full_example.py startafter=start_asset endbefore=end_asset -import pandas as pd -from dagster_snowflake import SnowflakeResource -from snowflake.connector.pandas_tools import write_pandas - -from dagster import MaterializeResult, asset - - -@asset -def iris_dataset(snowflake: SnowflakeResource): - iris_df = pd.read_csv( - "https://docs.dagster.io/assets/iris.csv", - names=[ - "sepal_length_cm", - "species", - ], - ) - - with snowflake.get_connection() as conn: - table_name = "iris_dataset" - database = "flowers" - schema = "iris" - success, number_chunks, rows_inserted, output = write_pandas( - conn, - iris_df, - table_name=table_name, - database=database, - schema=schema, - auto_create_table=True, - overwrite=True, - quote_identifiers=False, - ) - - return MaterializeResult( - metadata={"rows_inserted": rows_inserted}, - ) -``` + In this example, we've defined an asset that fetches the Iris dataset as a Pandas DataFrame. Then, using the Snowflake resource, the DataFrame is stored in Snowflake as the `FLOWERS.IRIS.IRIS_DATASET` table. @@ -129,12 +77,7 @@ If you have existing tables in Snowflake and other assets defined in Dagster dep Making Dagster aware of these tables allows you to track the full data lineage in Dagster. You can accomplish this by defining [external assets](/guides/build/assets/external-assets) for these tables. For example: -{/* TODO convert to */} -```python file=/integrations/snowflake/source_asset.py -from dagster import AssetSpec - -iris_harvest_data = AssetSpec(key="iris_harvest_data") -``` + In this example, we created a for a pre-existing table called `iris_harvest_data`. @@ -147,26 +90,7 @@ Since we supplied the database and the schema in the resource configuration in [ Once you've created an asset that represents a table in Snowflake, you may want to create additional assets that work with the data. In the following example, we've defined an asset that creates a second table, which contains only the data for the _Iris Setosa_ species: -{/* TODO convert to */} -```python file=/integrations/snowflake/resource_tutorial/full_example.py startafter=start_downstream endbefore=end_downstream -from dagster_snowflake import SnowflakeResource - -from dagster import asset - - -@asset(deps=["iris_dataset"]) -def iris_setosa(snowflake: SnowflakeResource) -> None: - query = """ - create or replace table iris.iris_setosa as ( - SELECT * - FROM iris.iris_dataset - WHERE species = 'Iris-setosa' - ); - """ - - with snowflake.get_connection() as conn: - conn.cursor.execute(query) -``` + To accomplish this, we defined a dependency on the `iris_dataset` asset using the `deps` parameter. Then, the SQL query runs and creates the table of _Iris Setosa_ data. @@ -174,14 +98,7 @@ To accomplish this, we defined a dependency on the `iris_dataset` asset using th The last step is to add the and the assets to the project's object: -{/* TODO convert to */} -```python file=/integrations/snowflake/resource_tutorial/full_example.py startafter=start_definitions endbefore=end_definitions -from dagster import Definitions - -defs = Definitions( - assets=[iris_dataset, iris_setosa], resources={"snowflake": snowflake} -) -``` + This makes the resource and assets available to Dagster tools like the UI and CLI. 
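Pulling the pieces of this tutorial together, a condensed sketch of the downstream asset plus the `Definitions` wiring; the placeholder `iris_dataset` stands in for the tutorial's `write_pandas` asset:

```python
from dagster_snowflake import SnowflakeResource

from dagster import Definitions, EnvVar, asset


@asset
def iris_dataset() -> None:
    # Placeholder for the tutorial's asset, which writes the Iris CSV to
    # Snowflake with write_pandas (see the full example below).
    ...


@asset(deps=["iris_dataset"])
def iris_setosa(snowflake: SnowflakeResource) -> None:
    # Build a second table containing only the Iris Setosa rows.
    query = """
        create or replace table iris.iris_setosa as (
            SELECT * FROM iris.iris_dataset WHERE species = 'Iris-setosa'
        );
    """
    with snowflake.get_connection() as conn:
        conn.cursor().execute(query)


defs = Definitions(
    assets=[iris_dataset, iris_setosa],
    resources={
        "snowflake": SnowflakeResource(
            account="abc1234.us-east-1",
            user=EnvVar("SNOWFLAKE_USER"),
            password=EnvVar("SNOWFLAKE_PASSWORD"),
            database="FLOWERS",
            schema="IRIS",
        )
    },
)
```

Note that the snippet being replaced above accesses `conn.cursor.execute(query)`; the Snowflake connector's cursor is obtained by calling `conn.cursor()`, so this sketch uses the call form.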
@@ -189,8 +106,8 @@ This makes the resource and assets available to Dagster tools like the UI and CL When finished, your code should look like the following: -{/* TODO convert to */} -```python file=/integrations/snowflake/resource_tutorial/full_example.py lines=1,4-16,27-58,67-80,86-88 +{/* TODO convert to CodeExample when 'lines' property implemented */} +```python file=docs_snippets/docs_snippets/integrations/snowflake/resource_tutorial/full_example.py lines=1,4-16,27-58,67-80,86-88 import pandas as pd from dagster_snowflake import SnowflakeResource from snowflake.connector.pandas_tools import write_pandas diff --git a/docs/docs-beta/docs/integrations/libraries/tableau.md b/docs/docs-beta/docs/integrations/libraries/tableau.md index 0ef72f8266b90..b4b986044d2fb 100644 --- a/docs/docs-beta/docs/integrations/libraries/tableau.md +++ b/docs/docs-beta/docs/integrations/libraries/tableau.md @@ -68,50 +68,15 @@ Dagster can automatically load all data sources, sheets, and dashboards from you Use to interact with your Tableau Cloud workspace: -{/* TODO convert to */} -```python file=/integrations/tableau/representing-tableau-cloud-assets.py -from dagster_tableau import TableauCloudWorkspace, load_tableau_asset_specs - -import dagster as dg - -# Connect to Tableau Cloud using the connected app credentials -tableau_workspace = TableauCloudWorkspace( - connected_app_client_id=dg.EnvVar("TABLEAU_CONNECTED_APP_CLIENT_ID"), - connected_app_secret_id=dg.EnvVar("TABLEAU_CONNECTED_APP_SECRET_ID"), - connected_app_secret_value=dg.EnvVar("TABLEAU_CONNECTED_APP_SECRET_VALUE"), - username=dg.EnvVar("TABLEAU_USERNAME"), - site_name=dg.EnvVar("TABLEAU_SITE_NAME"), - pod_name=dg.EnvVar("TABLEAU_POD_NAME"), -) - -tableau_specs = load_tableau_asset_specs(tableau_workspace) -defs = dg.Definitions(assets=[*tableau_specs], resources={"tableau": tableau_workspace}) -``` + Use to interact with your Tableau Server workspace: -{/* TODO convert to */} -```python file=/integrations/tableau/representing-tableau-server-assets.py -from dagster_tableau import TableauServerWorkspace, load_tableau_asset_specs - -import dagster as dg -# Connect to Tableau Server using the connected app credentials -tableau_workspace = TableauServerWorkspace( - connected_app_client_id=dg.EnvVar("TABLEAU_CONNECTED_APP_CLIENT_ID"), - connected_app_secret_id=dg.EnvVar("TABLEAU_CONNECTED_APP_SECRET_ID"), - connected_app_secret_value=dg.EnvVar("TABLEAU_CONNECTED_APP_SECRET_VALUE"), - username=dg.EnvVar("TABLEAU_USERNAME"), - site_name=dg.EnvVar("TABLEAU_SITE_NAME"), - server_name=dg.EnvVar("TABLEAU_SERVER_NAME"), -) - -tableau_specs = load_tableau_asset_specs(tableau_workspace) -defs = dg.Definitions(assets=[*tableau_specs], resources={"tableau": tableau_workspace}) -``` + @@ -120,52 +85,7 @@ defs = dg.Definitions(assets=[*tableau_specs], resources={"tableau": tableau_wor By default, Dagster will generate asset specs for each Tableau asset based on its type, and populate default metadata. You can further customize asset properties by passing a custom subclass to the function. This subclass can implement methods to customize the asset specs for each Tableau asset type. 
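For reference while the inline example moves to `docs_snippets`, a condensed sketch of the translator pattern; the class name and the `tableau` key prefix are illustrative, and the workspace fields mirror the snippets in this diff:

```python
import dagster as dg
from dagster_tableau import (
    DagsterTableauTranslator,
    TableauCloudWorkspace,
    load_tableau_asset_specs,
)
from dagster_tableau.translator import TableauTranslatorData

tableau_workspace = TableauCloudWorkspace(
    connected_app_client_id=dg.EnvVar("TABLEAU_CONNECTED_APP_CLIENT_ID"),
    connected_app_secret_id=dg.EnvVar("TABLEAU_CONNECTED_APP_SECRET_ID"),
    connected_app_secret_value=dg.EnvVar("TABLEAU_CONNECTED_APP_SECRET_VALUE"),
    username=dg.EnvVar("TABLEAU_USERNAME"),
    site_name=dg.EnvVar("TABLEAU_SITE_NAME"),
    pod_name=dg.EnvVar("TABLEAU_POD_NAME"),
)


class PrefixedTableauTranslator(DagsterTableauTranslator):
    def get_asset_spec(self, data: TableauTranslatorData) -> dg.AssetSpec:
        # Generate the default spec first, then customize it.
        default_spec = super().get_asset_spec(data)
        return default_spec.replace_attributes(
            key=default_spec.key.with_prefix("tableau"),
        )


tableau_specs = load_tableau_asset_specs(
    tableau_workspace,
    dagster_tableau_translator=PrefixedTableauTranslator(),
)
defs = dg.Definitions(assets=[*tableau_specs], resources={"tableau": tableau_workspace})
```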
-{/* TODO convert to */} -```python file=/integrations/tableau/customize-tableau-asset-defs.py -from dagster_tableau import ( - DagsterTableauTranslator, - TableauCloudWorkspace, - load_tableau_asset_specs, -) -from dagster_tableau.translator import TableauContentType, TableauTranslatorData - -import dagster as dg - -tableau_workspace = TableauCloudWorkspace( - connected_app_client_id=dg.EnvVar("TABLEAU_CONNECTED_APP_CLIENT_ID"), - connected_app_secret_id=dg.EnvVar("TABLEAU_CONNECTED_APP_SECRET_ID"), - connected_app_secret_value=dg.EnvVar("TABLEAU_CONNECTED_APP_SECRET_VALUE"), - username=dg.EnvVar("TABLEAU_USERNAME"), - site_name=dg.EnvVar("TABLEAU_SITE_NAME"), - pod_name=dg.EnvVar("TABLEAU_POD_NAME"), -) - - -# A translator class lets us customize properties of the built -# Tableau assets, such as the owners or asset key -class MyCustomTableauTranslator(DagsterTableauTranslator): - def get_asset_spec(self, data: TableauTranslatorData) -> dg.AssetSpec: - # We create the default asset spec using super() - default_spec = super().get_asset_spec(data) - # We customize the metadata and asset key prefix for all assets, including sheets, - # and we customize the team owner tag only for sheets. - return default_spec.replace_attributes( - key=default_spec.key.with_prefix("prefix"), - metadata={**default_spec.metadata, "custom": "metadata"}, - owners=( - ["team:my_team"] - if data.content_type == TableauContentType.SHEET - else ... - ), - ) - - -tableau_specs = load_tableau_asset_specs( - tableau_workspace, - dagster_tableau_translator=MyCustomTableauTranslator(), -) -defs = dg.Definitions(assets=[*tableau_specs], resources={"tableau": tableau_workspace}) -``` + Note that `super()` is called in each of the overridden methods to generate the default asset spec. It is best practice to generate the default asset spec before customizing it. @@ -173,91 +93,14 @@ Note that `super()` is called in each of the overridden methods to generate the Definitions from multiple Tableau workspaces can be combined by instantiating multiple Tableau resources and merging their specs. 
This lets you view all your Tableau assets in a single asset graph: -{/* TODO convert to */} -```python file=/integrations/tableau/multiple-tableau-workspaces.py + from dagster_tableau import TableauCloudWorkspace, load_tableau_asset_specs -import dagster as dg - -sales_team_workspace = TableauCloudWorkspace( - connected_app_client_id=dg.EnvVar("SALES_TABLEAU_CONNECTED_APP_CLIENT_ID"), - connected_app_secret_id=dg.EnvVar("SALES_TABLEAU_CONNECTED_APP_SECRET_ID"), - connected_app_secret_value=dg.EnvVar("SALES_TABLEAU_CONNECTED_APP_SECRET_VALUE"), - username=dg.EnvVar("TABLEAU_USERNAME"), - site_name=dg.EnvVar("SALES_TABLEAU_SITE_NAME"), - pod_name=dg.EnvVar("SALES_TABLEAU_POD_NAME"), -) - -marketing_team_workspace = TableauCloudWorkspace( - connected_app_client_id=dg.EnvVar("MARKETING_TABLEAU_CONNECTED_APP_CLIENT_ID"), - connected_app_secret_id=dg.EnvVar("MARKETING_TABLEAU_CONNECTED_APP_SECRET_ID"), - connected_app_secret_value=dg.EnvVar( - "MARKETING_TABLEAU_CONNECTED_APP_SECRET_VALUE" - ), - username=dg.EnvVar("TABLEAU_USERNAME"), - site_name=dg.EnvVar("MARKETING_TABLEAU_SITE_NAME"), - pod_name=dg.EnvVar("MARKETING_TABLEAU_POD_NAME"), -) - - -sales_team_specs = load_tableau_asset_specs(sales_team_workspace) -marketing_team_specs = load_tableau_asset_specs(marketing_team_workspace) - -defs = dg.Definitions( - assets=[*sales_team_specs, *marketing_team_specs], - resources={ - "marketing_tableau": marketing_team_workspace, - "sales_tableau": sales_team_workspace, - }, -) -``` - ### Refresh and materialize Tableau assets You can use Dagster to refresh Tableau workbooks and materialize Tableau sheets and dashboards. -{/* TODO convert to */} -```python file=/integrations/tableau/refresh-and-materialize-tableau-assets.py -from dagster_tableau import ( - TableauCloudWorkspace, - build_tableau_materializable_assets_definition, - load_tableau_asset_specs, - parse_tableau_external_and_materializable_asset_specs, -) - -import dagster as dg - -tableau_workspace = TableauCloudWorkspace( - connected_app_client_id=dg.EnvVar("TABLEAU_CONNECTED_APP_CLIENT_ID"), - connected_app_secret_id=dg.EnvVar("TABLEAU_CONNECTED_APP_SECRET_ID"), - connected_app_secret_value=dg.EnvVar("TABLEAU_CONNECTED_APP_SECRET_VALUE"), - username=dg.EnvVar("TABLEAU_USERNAME"), - site_name=dg.EnvVar("TABLEAU_SITE_NAME"), - pod_name=dg.EnvVar("TABLEAU_POD_NAME"), -) - -# Load Tableau asset specs -tableau_specs = load_tableau_asset_specs( - workspace=tableau_workspace, -) - -external_asset_specs, materializable_asset_specs = ( - parse_tableau_external_and_materializable_asset_specs(tableau_specs) -) - -# Use the asset definition builder to construct the definition for tableau materializable assets -defs = dg.Definitions( - assets=[ - build_tableau_materializable_assets_definition( - resource_key="tableau", - specs=materializable_asset_specs, - refreshable_workbook_ids=["b75fc023-a7ca-4115-857b-4342028640d0"], - ), - *external_asset_specs, - ], - resources={"tableau": tableau_workspace}, -) -``` + Note that only workbooks created with extracts can be refreshed using this method. See more about [refreshing data sources](https://help.tableau.com/current/pro/desktop/en-us/refreshing_data.htm) in Tableau documentation website. 
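A trimmed-down sketch of the refresh-and-materialize pattern described above; the workbook ID is a placeholder, and the helper names come from the snippet this hunk replaces:

```python
import dagster as dg
from dagster_tableau import (
    TableauCloudWorkspace,
    build_tableau_materializable_assets_definition,
    load_tableau_asset_specs,
    parse_tableau_external_and_materializable_asset_specs,
)

tableau_workspace = TableauCloudWorkspace(
    connected_app_client_id=dg.EnvVar("TABLEAU_CONNECTED_APP_CLIENT_ID"),
    connected_app_secret_id=dg.EnvVar("TABLEAU_CONNECTED_APP_SECRET_ID"),
    connected_app_secret_value=dg.EnvVar("TABLEAU_CONNECTED_APP_SECRET_VALUE"),
    username=dg.EnvVar("TABLEAU_USERNAME"),
    site_name=dg.EnvVar("TABLEAU_SITE_NAME"),
    pod_name=dg.EnvVar("TABLEAU_POD_NAME"),
)

# Split the loaded specs into external specs (data sources) and
# materializable specs (sheets and dashboards).
external_asset_specs, materializable_asset_specs = (
    parse_tableau_external_and_materializable_asset_specs(
        load_tableau_asset_specs(workspace=tableau_workspace)
    )
)

defs = dg.Definitions(
    assets=[
        build_tableau_materializable_assets_definition(
            resource_key="tableau",
            specs=materializable_asset_specs,
            # Placeholder ID; only extract-based workbooks can be refreshed.
            refreshable_workbook_ids=["<your-workbook-id>"],
        ),
        *external_asset_specs,
    ],
    resources={"tableau": tableau_workspace},
)
```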
@@ -265,142 +108,16 @@ Note that only workbooks created with extracts can be refreshed using this metho When an upstream dependency of a Tableau asset fails to materialize or to pass the asset checks, it is possible to add a [Data Quality Warning](https://help.tableau.com/current/online/en-us/dm_dqw.htm) to the corresponding data source in Tableau. This can be achieved by leveraging the `add_data_quality_warning_to_data_source` in a sensor. -{/* TODO convert to */} -```python file=/integrations/tableau/add-tableau-data-quality-warning.py -from dagster_tableau import ( - TableauCloudWorkspace, - build_tableau_materializable_assets_definition, - load_tableau_asset_specs, - parse_tableau_external_and_materializable_asset_specs, -) - -import dagster as dg - -# Connect to Tableau Cloud using the connected app credentials -tableau_workspace = TableauCloudWorkspace( - connected_app_client_id=dg.EnvVar("TABLEAU_CONNECTED_APP_CLIENT_ID"), - connected_app_secret_id=dg.EnvVar("TABLEAU_CONNECTED_APP_SECRET_ID"), - connected_app_secret_value=dg.EnvVar("TABLEAU_CONNECTED_APP_SECRET_VALUE"), - username=dg.EnvVar("TABLEAU_USERNAME"), - site_name=dg.EnvVar("TABLEAU_SITE_NAME"), - pod_name=dg.EnvVar("TABLEAU_POD_NAME"), -) - - -@dg.asset( - # Define which Tableau data source this upstream asset corresponds to - metadata={"dagster/tableau_data_source_id": "f5660c7-2b05-4ff0-90ce-3199226956c6"} -) -def upstream_asset(): ... - - -@dg.run_failure_sensor -def tableau_run_failure_sensor( - context: dg.RunFailureSensorContext, tableau: TableauCloudWorkspace -): - asset_keys = context.dagster_run.asset_selection or set() - for asset_key in asset_keys: - data_source_id = upstream_asset.metadata_by_key.get(asset_key, {}).get( - "dagster/tableau_data_source_id" - ) - if data_source_id: - with tableau.get_client() as client: - client.add_data_quality_warning_to_data_source( - data_source_id=data_source_id, message=context.failure_event.message - ) - - -tableau_specs = load_tableau_asset_specs( - workspace=tableau_workspace, -) - -external_asset_specs, materializable_asset_specs = ( - parse_tableau_external_and_materializable_asset_specs(tableau_specs) -) - -# Pass the sensor, Tableau resource, upstream asset, Tableau assets specs and materializable assets definition at once -defs = dg.Definitions( - assets=[ - upstream_asset, - build_tableau_materializable_assets_definition( - resource_key="tableau", - specs=materializable_asset_specs, - refreshable_workbook_ids=["b75fc023-a7ca-4115-857b-4342028640d0"], - ), - *external_asset_specs, - ], - sensors=[tableau_run_failure_sensor], - resources={"tableau": tableau_workspace}, -) -``` + + ### Customizing how Tableau assets are materialized Instead of using the out-of-the-box utility, you can build your own assets definition that trigger the refresh of your Tableau workbooks. This allows you to customize how the refresh is triggered or to run custom code before or after the refresh. 
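A minimal sketch of such a custom assets-definition factory, essentially a condensed version of the advanced example this section points to; the post-refresh notification is left as a comment:

```python
from collections.abc import Sequence

import dagster as dg
from dagster_tableau import TableauCloudWorkspace


def build_tableau_materialize_and_notify_asset_def(
    specs: Sequence[dg.AssetSpec], refreshable_workbook_ids: Sequence[str]
) -> dg.AssetsDefinition:
    @dg.multi_asset(name="tableau_sync", compute_kind="tableau", specs=specs)
    def asset_fn(context: dg.AssetExecutionContext, tableau: TableauCloudWorkspace):
        with tableau.get_client() as client:
            yield from client.refresh_and_materialize_workbooks(
                specs=specs, refreshable_workbook_ids=refreshable_workbook_ids
            )
            # Custom post-refresh logic (for example, sending a notification)
            # would run here.

    return asset_fn
```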
-{/* TODO convert to */} -```python file=/integrations/tableau/materialize-tableau-assets-advanced.py + from collections.abc import Sequence -from dagster_tableau import ( - TableauCloudWorkspace, - load_tableau_asset_specs, - parse_tableau_external_and_materializable_asset_specs, -) - -import dagster as dg - -tableau_workspace = TableauCloudWorkspace( - connected_app_client_id=dg.EnvVar("TABLEAU_CONNECTED_APP_CLIENT_ID"), - connected_app_secret_id=dg.EnvVar("TABLEAU_CONNECTED_APP_SECRET_ID"), - connected_app_secret_value=dg.EnvVar("TABLEAU_CONNECTED_APP_SECRET_VALUE"), - username=dg.EnvVar("TABLEAU_USERNAME"), - site_name=dg.EnvVar("TABLEAU_SITE_NAME"), - pod_name=dg.EnvVar("TABLEAU_POD_NAME"), -) - - -# Assets definition factory which triggers workbooks refresh and sends a notification once complete -def build_tableau_materialize_and_notify_asset_def( - specs: Sequence[dg.AssetSpec], refreshable_workbook_ids: Sequence[str] -) -> dg.AssetsDefinition: - @dg.multi_asset( - name="tableau_sync", - compute_kind="tableau", - specs=specs, - ) - def asset_fn(context: dg.AssetExecutionContext, tableau: TableauCloudWorkspace): - with tableau.get_client() as client: - yield from client.refresh_and_materialize_workbooks( - specs=specs, refreshable_workbook_ids=refreshable_workbook_ids - ) - # Do some custom work after refreshing here, such as sending an email notification - - return asset_fn - - -# Load Tableau asset specs -tableau_specs = load_tableau_asset_specs( - workspace=tableau_workspace, -) - -external_asset_specs, materializable_asset_specs = ( - parse_tableau_external_and_materializable_asset_specs(tableau_specs) -) - -# Use the asset definition builder to construct the definition for tableau materializable assets -defs = dg.Definitions( - assets=[ - build_tableau_materialize_and_notify_asset_def( - specs=materializable_asset_specs, - refreshable_workbook_ids=["b75fc023-a7ca-4115-857b-4342028640d0"], - ), - *external_asset_specs, - ], - resources={"tableau": tableau_workspace}, -) -``` - ### Related - [`dagster-tableau` API reference](/api/python-api/libraries/dagster-tableau) From 00495d19cfec03774def7b53ed777aedb81bd76c Mon Sep 17 00:00:00 2001 From: nikki everett Date: Wed, 5 Feb 2025 23:28:14 -0700 Subject: [PATCH 3/9] convert automate docs Signed-off-by: nikki everett --- .../example-customizations.md | 72 ++--------- .../schedules/configuring-job-behavior.md | 24 +--- ...hedules-for-partitioned-assets-and-jobs.md | 117 ++---------------- .../customizing-execution-timezone.md | 16 +-- .../automate/schedules/defining-schedules.md | 17 +-- .../automate/schedules/testing-schedules.md | 98 +-------------- .../schedules/using-resources-in-schedules.md | 43 +------ .../automate/sensors/logging-in-sensors.md | 8 +- .../automate/sensors/run-status-sensors.md | 42 +------ .../sensors/testing-run-status-sensors.md | 97 ++------------- .../automate/sensors/testing-sensors.md | 67 +--------- .../sensors/using-resources-in-sensors.md | 48 +------ 12 files changed, 45 insertions(+), 604 deletions(-) diff --git a/docs/docs-beta/docs/guides/automate/declarative-automation/customizing-automation-conditions/example-customizations.md b/docs/docs-beta/docs/guides/automate/declarative-automation/customizing-automation-conditions/example-customizations.md index 1af64f0ae5978..247cdb3985f35 100644 --- a/docs/docs-beta/docs/guides/automate/declarative-automation/customizing-automation-conditions/example-customizations.md +++ 
b/docs/docs-beta/docs/guides/automate/declarative-automation/customizing-automation-conditions/example-customizations.md @@ -9,16 +9,7 @@ By default, `AutomationCondition.eager()` will not materialize a target if it ha If it is expected to have missing upstream data, remove `~AutomationCondition.any_deps_missing()` from the eager policy to allow execution: -{/* TODO convert to */} -```python file=concepts/declarative_automation/allow_missing_upstreams.py -import dagster as dg - -condition = ( - dg.AutomationCondition.eager() - .without(~dg.AutomationCondition.missing()) - .with_label("eager_allow_missing") -) -``` + ## Updating older time partitions @@ -28,14 +19,7 @@ By default, `AutomationCondition.eager()` will only update the latest time parti If updates to historical partitions should result in downstream updates, then this sub-condition can be removed: -{/* TODO convert to */} -```python file=concepts/declarative_automation/update_older_time_partitions.py -from dagster import AutomationCondition - -condition = AutomationCondition.eager().without( - AutomationCondition.in_latest_time_window(), -) -``` + ### Updating older time partitions with AutomationCondition.on_cron() @@ -43,57 +27,23 @@ By default, `AutomationCondition.on_cron()` will target the latest time partitio If you instead want to update partitions on a delay, then you can replace this condition with one that targets a partition that has a specific lag from the latest time window: -{/* TODO convert to */} -```python file=concepts/declarative_automation/update_specific_older_partition.py -from datetime import timedelta - -from dagster import AutomationCondition - -five_days_ago_condition = AutomationCondition.in_latest_time_window( - timedelta(days=5) -) & ~AutomationCondition.in_latest_time_window(timedelta(days=4)) - -condition = AutomationCondition.eager().replace( - "in_latest_time_window", five_days_ago_condition -) -``` + ## Ignoring dependencies when using AutomationCondition.on_cron() By default, `AutomationCondition.on_cron()` will wait for all upstream dependencies to be updated before executing the asset it's attached to. In some cases, it can be useful to ignore some upstream dependencies in this calculation. This can be done by passing in an to be ignored: -{/* TODO convert to */} -```python file=concepts/declarative_automation/ignore_dependencies_cron.py -import dagster as dg - -condition = dg.AutomationCondition.on_cron("@hourly").ignore( - dg.AssetSelection.assets("foo") -) -``` + Alternatively, you can pass in an to be allowed: -{/* TODO convert to */} -```python file=concepts/declarative_automation/allow_dependencies_cron.py -import dagster as dg - -condition = dg.AutomationCondition.on_cron("@hourly").allow( - dg.AssetSelection.groups("abc") -) -``` + ### Wait for all blocking asset checks to complete before executing The `AutomationCondition.all_deps_blocking_checks_passed()` condition becomes true after all upstream blocking checks have passed. 
This can be combined with built-in conditions such as `AutomationCondition.on_cron()` and `AutomationCondition.eager()` to ensure that your asset does not execute if upstream data is in a bad state: -```python file=concepts/declarative_automation/blocking_checks_condition.py -import dagster as dg - -condition = ( - dg.AutomationCondition.eager() - & dg.AutomationCondition.all_deps_blocking_checks_passed() -) -``` + ## Waiting for all blocking asset checks to complete before executing @@ -101,12 +51,4 @@ The `AutomationCondition.all_deps_blocking_checks_passed()` condition becomes tr This can be combined with built-in conditions such as `AutomationCondition.on_cron()` and `AutomationCondition.eager()` to ensure that your asset does not execute if upstream data is in a bad state: -{/* TODO convert to */} -```python file=concepts/declarative_automation/blocking_checks_condition.py -import dagster as dg - -condition = ( - dg.AutomationCondition.eager() - & dg.AutomationCondition.all_deps_blocking_checks_passed() -) -``` + diff --git a/docs/docs-beta/docs/guides/automate/schedules/configuring-job-behavior.md b/docs/docs-beta/docs/guides/automate/schedules/configuring-job-behavior.md index afe907a3c8141..b27e2b569a00c 100644 --- a/docs/docs-beta/docs/guides/automate/schedules/configuring-job-behavior.md +++ b/docs/docs-beta/docs/guides/automate/schedules/configuring-job-behavior.md @@ -5,29 +5,7 @@ sidebar_position: 200 This example demonstrates how to use run config to vary the behavior of a job based on its scheduled run time. -{/* TODO convert to */} -```python file=concepts/partitions_schedules_sensors/schedules/schedules.py startafter=start_run_config_schedule endbefore=end_run_config_schedule -@op(config_schema={"scheduled_date": str}) -def configurable_op(context: OpExecutionContext): - context.log.info(context.op_config["scheduled_date"]) - - -@job -def configurable_job(): - configurable_op() - - -@schedule(job=configurable_job, cron_schedule="0 0 * * *") -def configurable_job_schedule(context: ScheduleEvaluationContext): - scheduled_date = context.scheduled_execution_time.strftime("%Y-%m-%d") - return RunRequest( - run_key=None, - run_config={ - "ops": {"configurable_op": {"config": {"scheduled_date": scheduled_date}}} - }, - tags={"date": scheduled_date}, - ) -``` + ## APIs in this example diff --git a/docs/docs-beta/docs/guides/automate/schedules/constructing-schedules-for-partitioned-assets-and-jobs.md b/docs/docs-beta/docs/guides/automate/schedules/constructing-schedules-for-partitioned-assets-and-jobs.md index 3048e6c93828d..baaf684007b09 100644 --- a/docs/docs-beta/docs/guides/automate/schedules/constructing-schedules-for-partitioned-assets-and-jobs.md +++ b/docs/docs-beta/docs/guides/automate/schedules/constructing-schedules-for-partitioned-assets-and-jobs.md @@ -35,29 +35,7 @@ Refer to the following tabs for examples of asset and op-based jobs using . In this example, we created an asset job named `partitioned_job` and then constructed `asset_partitioned_schedule` by using : -{/* TODO convert to */} -```python file=/concepts/partitions_schedules_sensors/schedule_from_partitions.py startafter=start_partitioned_asset_schedule endbefore=end_partitioned_asset_schedule -from dagster import ( - asset, - build_schedule_from_partitioned_job, - define_asset_job, - DailyPartitionsDefinition, -) - -daily_partition = DailyPartitionsDefinition(start_date="2024-05-20") - - -@asset(partitions_def=daily_partition) -def daily_asset(): ... 
- - -partitioned_asset_job = define_asset_job("partitioned_job", selection=[daily_asset]) - - -asset_partitioned_schedule = build_schedule_from_partitioned_job( - partitioned_asset_job, -) -``` + @@ -66,19 +44,7 @@ asset_partitioned_schedule = build_schedule_from_partitioned_job( Op jobs are defined using the . In this example, we created a partitioned job named `partitioned_op_job` and then constructed `partitioned_op_schedule` using : -{/* TODO convert to */} -```python file=/concepts/partitions_schedules_sensors/schedule_from_partitions.py startafter=start_marker endbefore=end_marker -from dagster import build_schedule_from_partitioned_job, job - - -@job(config=partitioned_config) -def partitioned_op_job(): ... - - -partitioned_op_schedule = build_schedule_from_partitioned_job( - partitioned_op_job, -) -``` + @@ -89,14 +55,7 @@ The `minute_of_hour`, `hour_of_day`, `day_of_week`, and `day_of_month` parameter Consider the following job: -{/* TODO convert to */} -```python file=/concepts/partitions_schedules_sensors/schedule_from_partitions.py startafter=start_partitioned_schedule_with_offset endbefore=end_partitioned_schedule_with_offset -from dagster import build_schedule_from_partitioned_job - -asset_partitioned_schedule = build_schedule_from_partitioned_job( - partitioned_asset_job, hour_of_day=1, minute_of_hour=30 -) -``` + On May 20, 2024, the schedule will evaluate at 1:30 AM UTC and then start a run for the partition key of the previous day, `2024-05-19`. @@ -127,14 +86,7 @@ After `2024-05-20 23:59:59` passes, the time window is complete and Dagster will If you need to customize the ending, or most recent partition in a set, use the `end_offset` parameter in the partition's config: -{/* TODO convert to */} -```python file=/concepts/partitions_schedules_sensors/schedule_from_partitions.py startafter=start_offset_partition endbefore=end_offset_partition -from dagster import DailyPartitionsDefinition - -daily_partition_with_offset = DailyPartitionsDefinition( - start_date="2024-05-20", end_offset=-1 -) -``` + Setting this parameter changes the partition that will be filled in at each schedule tick. 
Positive and negative integers are accepted, which will have the following effects: @@ -161,70 +113,15 @@ Next, we'll demonstrate how to create a schedule for a job with a static partiti In this example, the job is partitioned by continent: -{/* TODO convert to */} -```python file=/concepts/partitions_schedules_sensors/static_partitioned_asset_job.py startafter=start_job endbefore=end_job -from dagster import ( - AssetExecutionContext, - Config, - asset, - define_asset_job, - static_partitioned_config, -) - -CONTINENTS = [ - "Africa", - "Antarctica", - "Asia", - "Europe", - "North America", - "Oceania", - "South America", -] - - -@static_partitioned_config(partition_keys=CONTINENTS) -def continent_config(partition_key: str): - return {"ops": {"continents": {"config": {"continent_name": partition_key}}}} - - -class ContinentOpConfig(Config): - continent_name: str - - -@asset -def continents(context: AssetExecutionContext, config: ContinentOpConfig): - context.log.info(config.continent_name) - - -continent_job = define_asset_job( - name="continent_job", selection=[continents], config=continent_config -) -``` + Using the decorator, we'll write a schedule that targets each partition, or `continent`: -{/* TODO convert to */} -```python file=/concepts/partitions_schedules_sensors/static_partitioned_asset_job.py startafter=start_schedule_all_partitions endbefore=end_schedule_all_partitions -from dagster import RunRequest, schedule - - -@schedule(cron_schedule="0 0 * * *", job=continent_job) -def continent_schedule(): - for c in CONTINENTS: - yield RunRequest(run_key=c, partition_key=c) -``` + If we only want to target the `Antarctica` partition, we can create a schedule like the following: -{/* TODO convert to */} -```python file=/concepts/partitions_schedules_sensors/static_partitioned_asset_job.py startafter=start_single_partition endbefore=end_single_partition -from dagster import RunRequest, schedule - - -@schedule(cron_schedule="0 0 * * *", job=continent_job) -def antarctica_schedule(): - return RunRequest(partition_key="Antarctica") -``` + ## APIs in this guide diff --git a/docs/docs-beta/docs/guides/automate/schedules/customizing-execution-timezone.md b/docs/docs-beta/docs/guides/automate/schedules/customizing-execution-timezone.md index 0a889957c5022..5c0fcc2811f9a 100644 --- a/docs/docs-beta/docs/guides/automate/schedules/customizing-execution-timezone.md +++ b/docs/docs-beta/docs/guides/automate/schedules/customizing-execution-timezone.md @@ -29,12 +29,7 @@ Using the `execution_timezone` parameter allows you to specify a timezone for th This parameter accepts any [`tz` timezone](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones). 
For example, the following schedule will execute **every day at 9:00 AM in US Pacific time (America/Los_Angeles)**: -{/* TODO convert to */} -```python file=concepts/partitions_schedules_sensors/schedules/schedules.py startafter=start_timezone endbefore=end_timezone -my_timezone_schedule = ScheduleDefinition( - job=my_job, cron_schedule="0 9 * * *", execution_timezone="America/Los_Angeles" -) -``` + ## Setting timezones on partitioned jobs @@ -42,14 +37,7 @@ Schedules constructed from partitioned jobs execute in the timezone defined on t For example, the following partition uses the **US Pacific (America/Los_Angeles)** timezone: -{/* TODO convert to */} -```python file=concepts/partitions_schedules_sensors/partition_with_timezone.py -from dagster import DailyPartitionsDefinition - -daily_partition = DailyPartitionsDefinition( - start_date="2024-05-20", timezone="America/Los_Angeles" -) -``` + ## Execution times and Daylight Savings Time diff --git a/docs/docs-beta/docs/guides/automate/schedules/defining-schedules.md b/docs/docs-beta/docs/guides/automate/schedules/defining-schedules.md index d0f73b4ba0e75..cf903a2007a6f 100644 --- a/docs/docs-beta/docs/guides/automate/schedules/defining-schedules.md +++ b/docs/docs-beta/docs/guides/automate/schedules/defining-schedules.md @@ -12,14 +12,7 @@ The following examples demonstrate how to define some basic schedules. This example demonstrates how to define a schedule using that will run a job every day at midnight. While this example uses op jobs, the same approach will work with [asset jobs](/guides/build/assets/asset-jobs). -{/* TODO convert to */} -```python file=concepts/partitions_schedules_sensors/schedules/schedules.py startafter=start_basic_schedule endbefore=end_basic_schedule -@job -def my_job(): ... - - -basic_schedule = ScheduleDefinition(job=my_job, cron_schedule="0 0 * * *") -``` + :::note @@ -53,13 +46,7 @@ The `cron_schedule` argument accepts standard [cron expressions](https://en.wiki This example demonstrates how to emit log messages from a schedule during its evaluation function. These logs will be visible in the UI when you inspect a tick in the schedule's tick history. 
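A minimal sketch of a schedule that logs during evaluation, mirroring the `logs_then_skips` snippet referenced here; `my_job` is a stand-in job:

```python
from dagster import ScheduleEvaluationContext, SkipReason, job, schedule


@job
def my_job(): ...


@schedule(job=my_job, cron_schedule="* * * * *")
def logs_then_skips(context: ScheduleEvaluationContext):
    # Messages logged here appear in the tick's logs in the UI.
    context.log.info("Logging from a schedule!")
    return SkipReason("Nothing to do")
```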
-{/* TODO convert to */} -```python file=concepts/partitions_schedules_sensors/schedules/schedules.py startafter=start_schedule_logging endbefore=end_schedule_logging -@schedule(job=my_job, cron_schedule="* * * * *") -def logs_then_skips(context): - context.log.info("Logging from a schedule!") - return SkipReason("Nothing to do") -``` + :::note diff --git a/docs/docs-beta/docs/guides/automate/schedules/testing-schedules.md b/docs/docs-beta/docs/guides/automate/schedules/testing-schedules.md index 6a44a772e176b..28a38f08aabc9 100644 --- a/docs/docs-beta/docs/guides/automate/schedules/testing-schedules.md +++ b/docs/docs-beta/docs/guides/automate/schedules/testing-schedules.md @@ -38,44 +38,11 @@ To test a function decorated by the */} -```python file=concepts/partitions_schedules_sensors/schedules/schedules.py startafter=start_run_config_schedule endbefore=end_run_config_schedule -@op(config_schema={"scheduled_date": str}) -def configurable_op(context: OpExecutionContext): - context.log.info(context.op_config["scheduled_date"]) - - -@job -def configurable_job(): - configurable_op() - - -@schedule(job=configurable_job, cron_schedule="0 0 * * *") -def configurable_job_schedule(context: ScheduleEvaluationContext): - scheduled_date = context.scheduled_execution_time.strftime("%Y-%m-%d") - return RunRequest( - run_key=None, - run_config={ - "ops": {"configurable_op": {"config": {"scheduled_date": scheduled_date}}} - }, - tags={"date": scheduled_date}, - ) -``` + To test this schedule, we used to construct a to provide to the `context` parameter: -{/* TODO convert to */} -```python file=concepts/partitions_schedules_sensors/schedules/schedule_examples.py startafter=start_test_cron_schedule_context endbefore=end_test_cron_schedule_context -from dagster import build_schedule_context, validate_run_config - - -def test_configurable_job_schedule(): - context = build_schedule_context( - scheduled_execution_time=datetime.datetime(2020, 1, 1) - ) - run_request = configurable_job_schedule(context) - assert validate_run_config(configurable_job, run_request.run_config) -``` + If your -decorated function doesn't have a context parameter, you don't need to provide one when invoking it. @@ -85,66 +52,13 @@ For schedules that utilize [resources](/guides/build/external-resources), you ca Let's say we want to test the `process_data_schedule` in this example: -{/* TODO convert to */} -```python file=/concepts/resources/pythonic_resources.py startafter=start_new_resource_on_schedule endbefore=end_new_resource_on_schedule dedent=4 -from dagster import ( - schedule, - ScheduleEvaluationContext, - ConfigurableResource, - job, - RunRequest, - RunConfig, - Definitions, -) -from datetime import datetime -from typing import List - -class DateFormatter(ConfigurableResource): - format: str - - def strftime(self, dt: datetime) -> str: - return dt.strftime(self.format) - -@job -def process_data(): ... 
- -@schedule(job=process_data, cron_schedule="* * * * *") -def process_data_schedule( - context: ScheduleEvaluationContext, - date_formatter: DateFormatter, -): - formatted_date = date_formatter.strftime(context.scheduled_execution_time) - - return RunRequest( - run_key=None, - tags={"date": formatted_date}, - ) - -defs = Definitions( - jobs=[process_data], - schedules=[process_data_schedule], - resources={"date_formatter": DateFormatter(format="%Y-%m-%d")}, -) -``` +{/* TODO add dedent=4 prop to CodeExample below when implemented */} + In the test for this schedule, we provided the `date_formatter` resource to the schedule when we invoked its function: -{/* TODO convert to */} -```python file=/concepts/resources/pythonic_resources.py startafter=start_test_resource_on_schedule endbefore=end_test_resource_on_schedule dedent=4 -from dagster import build_schedule_context, validate_run_config - -def test_process_data_schedule(): - context = build_schedule_context( - scheduled_execution_time=datetime.datetime(2020, 1, 1) - ) - run_request = process_data_schedule( - context, date_formatter=DateFormatter(format="%Y-%m-%d") - ) - assert ( - run_request.run_config["ops"]["fetch_data"]["config"]["date"] - == "2020-01-01" - ) -``` +{/* TODO add dedent=4 prop to CodeExample below when implemented */} + ## APIs in this guide diff --git a/docs/docs-beta/docs/guides/automate/schedules/using-resources-in-schedules.md b/docs/docs-beta/docs/guides/automate/schedules/using-resources-in-schedules.md index 3f9573d1d7565..5f6fb2d48b36c 100644 --- a/docs/docs-beta/docs/guides/automate/schedules/using-resources-in-schedules.md +++ b/docs/docs-beta/docs/guides/automate/schedules/using-resources-in-schedules.md @@ -13,47 +13,8 @@ All Dagster definitions, including schedules and resources, must be attached to ::: -{/* TODO convert to */} -```python file=/concepts/resources/pythonic_resources.py startafter=start_new_resource_on_schedule endbefore=end_new_resource_on_schedule dedent=4 -from dagster import ( - schedule, - ScheduleEvaluationContext, - ConfigurableResource, - job, - RunRequest, - RunConfig, - Definitions, -) -from datetime import datetime -from typing import List - -class DateFormatter(ConfigurableResource): - format: str - - def strftime(self, dt: datetime) -> str: - return dt.strftime(self.format) - -@job -def process_data(): ... 
- -@schedule(job=process_data, cron_schedule="* * * * *") -def process_data_schedule( - context: ScheduleEvaluationContext, - date_formatter: DateFormatter, -): - formatted_date = date_formatter.strftime(context.scheduled_execution_time) - - return RunRequest( - run_key=None, - tags={"date": formatted_date}, - ) - -defs = Definitions( - jobs=[process_data], - schedules=[process_data_schedule], - resources={"date_formatter": DateFormatter(format="%Y-%m-%d")}, -) -``` +{/* TODO add dedent=4 prop to CodeExample below when implemented */} + ## APIs in this guide diff --git a/docs/docs-beta/docs/guides/automate/sensors/logging-in-sensors.md b/docs/docs-beta/docs/guides/automate/sensors/logging-in-sensors.md index 142e38bc42181..f60e118ac74ee 100644 --- a/docs/docs-beta/docs/guides/automate/sensors/logging-in-sensors.md +++ b/docs/docs-beta/docs/guides/automate/sensors/logging-in-sensors.md @@ -5,13 +5,7 @@ sidebar_position: 200 Any sensor can emit log messages during its evaluation function: -{/* TODO convert to */} -```python file=concepts/partitions_schedules_sensors/sensors/sensors.py startafter=start_sensor_logging endbefore=end_sensor_logging -@sensor(target=the_job) -def logs_then_skips(context): - context.log.info("Logging from a sensor!") - return SkipReason("Nothing to do") -``` + These logs can be viewed when inspecting a tick in the tick history view on the corresponding sensor page. diff --git a/docs/docs-beta/docs/guides/automate/sensors/run-status-sensors.md b/docs/docs-beta/docs/guides/automate/sensors/run-status-sensors.md index 10c85cfdba65f..d7c119f596982 100644 --- a/docs/docs-beta/docs/guides/automate/sensors/run-status-sensors.md +++ b/docs/docs-beta/docs/guides/automate/sensors/run-status-sensors.md @@ -7,24 +7,7 @@ If you want to act on the status of a run, Dagster provides a way to create a se Here is an example of a run status sensor that launches a run of `status_reporting_job` if a run is successful: -{/* TODO convert to */} -```python file=concepts/partitions_schedules_sensors/sensors/run_status_run_requests.py startafter=start endbefore=end -@run_status_sensor( - run_status=DagsterRunStatus.SUCCESS, - request_job=status_reporting_job, -) -def report_status_sensor(context): - # this condition prevents the sensor from triggering status_reporting_job again after it succeeds - if context.dagster_run.job_name != status_reporting_job.name: - run_config = { - "ops": { - "status_report": {"config": {"job_name": context.dagster_run.job_name}} - } - } - return RunRequest(run_key=None, run_config=run_config) - else: - return SkipReason("Don't report status of status_reporting_job") -``` + `request_job` is the job that will be run when the `RunRequest` is returned. @@ -32,29 +15,10 @@ Note that in `report_status_sensor` we conditionally return a `RunRequest`. 
Here is an example of a sensor that reports job success in a Slack message:

-{/* TODO convert to */}
-```python file=/concepts/partitions_schedules_sensors/sensors/sensor_alert.py startafter=start_success_sensor_marker endbefore=end_success_sensor_marker
-from dagster import run_status_sensor, RunStatusSensorContext, DagsterRunStatus
-
-
-@run_status_sensor(run_status=DagsterRunStatus.SUCCESS)
-def my_slack_on_run_success(context: RunStatusSensorContext):
-    slack_client = WebClient(token=os.environ["SLACK_DAGSTER_ETL_BOT_TOKEN"])
-
-    slack_client.chat_postMessage(
-        channel="#alert-channel",
-        text=f'Job "{context.dagster_run.job_name}" succeeded.',
-    )
-```
+

When a run status sensor is triggered by a run but doesn't return anything, Dagster will report an event back to the run to indicate that the sensor ran.

Once you have written your sensor, you can add the sensor to a `Definitions` object so it can be enabled and used the same as other sensors:

-{/* TODO convert to */}
-```python file=/concepts/partitions_schedules_sensors/sensors/sensor_alert.py startafter=start_definitions_marker endbefore=end_definitions_marker
-from dagster import Definitions
-
-
-defs = Definitions(jobs=[my_sensor_job], sensors=[my_slack_on_run_success])
-```
+

diff --git a/docs/docs-beta/docs/guides/automate/sensors/testing-run-status-sensors.md b/docs/docs-beta/docs/guides/automate/sensors/testing-run-status-sensors.md
index b335099f84168..03b2aad24d6ae 100644
--- a/docs/docs-beta/docs/guides/automate/sensors/testing-run-status-sensors.md
+++ b/docs/docs-beta/docs/guides/automate/sensors/testing-run-status-sensors.md
@@ -7,53 +7,16 @@ As with other sensors, you can directly invoke run status sensors. However, the

If you had written a status sensor like this (assuming you implemented the function `email_alert` elsewhere):

-{/* TODO convert to */}
-```python file=/concepts/partitions_schedules_sensors/sensors/sensor_alert.py startafter=start_simple_success_sensor endbefore=end_simple_success_sensor
-@run_status_sensor(run_status=DagsterRunStatus.SUCCESS)
-def my_email_sensor(context: RunStatusSensorContext):
-    message = f'Job "{context.dagster_run.job_name}" succeeded.'
-    email_alert(message)
-```
-We can first write a simple job that will succeed:
-
-{/* TODO convert to */}
-```python file=/concepts/partitions_schedules_sensors/sensors/sensor_alert.py startafter=start_run_status_sensor_testing_with_context_setup endbefore=end_run_status_sensor_testing_with_context_setup
-@op
-def succeeds():
-    return 1
+
+We can first write a simple job that will succeed:

-@job
-def my_job_succeeds():
-    succeeds()
-```
+

Then we can execute this job and pull the attributes we need to build the `context`.
We provide a function that will return the correct context object: -{/* TODO convert to */} -```python file=/concepts/partitions_schedules_sensors/sensors/sensor_alert.py startafter=start_run_status_sensor_testing_marker endbefore=end_run_status_sensor_testing_marker -# execute the job -instance = DagsterInstance.ephemeral() -result = my_job_succeeds.execute_in_process(instance=instance) - -# retrieve the DagsterRun -dagster_run = result.dagster_run - -# retrieve a success event from the completed execution -dagster_event = result.get_job_success_event() - -# create the context -run_status_sensor_context = build_run_status_sensor_context( - sensor_name="my_email_sensor", - dagster_instance=instance, - dagster_run=dagster_run, - dagster_event=dagster_event, -) - -# run the sensor -my_email_sensor(run_status_sensor_context) -``` + {/* TODO the methods and statuses below do not exist in API docs We have provided convenience functions and for retrieving `DagsterRunStatus.SUCCESS` and `DagsterRunStatus.FAILURE` events, respectively. If you have a run status sensor triggered on another status, you can retrieve all events from `result` and filter based on your event type. @@ -61,60 +24,14 @@ We have provided convenience functions . If we wanted to test this run failure sensor: -{/* TODO convert to */} -```python file=/concepts/partitions_schedules_sensors/sensors/sensor_alert.py startafter=start_simple_fail_sensor endbefore=end_simple_fail_sensor -@run_failure_sensor -def my_email_failure_sensor(context: RunFailureSensorContext): - message = ( - f'Job "{context.dagster_run.job_name}" failed. Error:' - f" {context.failure_event.message}" - ) - email_alert(message) -``` + We first need to make a simple job that will fail: -{/* TODO convert to */} -```python file=/concepts/partitions_schedules_sensors/sensors/sensor_alert.py startafter=start_failure_sensor_testing_with_context_setup endbefore=end_failure_sensor_testing_with_context_setup -from dagster import op, job - - -@op -def fails(): - raise Exception("failure!") - - -@job -def my_job_fails(): - fails() -``` + Then we can execute the job and create our context: -{/* TODO convert to */} -```python file=/concepts/partitions_schedules_sensors/sensors/sensor_alert.py startafter=start_alert_sensor_testing_with_context_marker endbefore=end_alert_sensor_testing_with_context_marker -from dagster import DagsterInstance, build_run_status_sensor_context - -# execute the job -instance = DagsterInstance.ephemeral() -result = my_job_fails.execute_in_process(instance=instance, raise_on_error=False) - -# retrieve the DagsterRun -dagster_run = result.dagster_run - -# retrieve a failure event from the completed job execution -dagster_event = result.get_job_failure_event() - -# create the context -run_failure_sensor_context = build_run_status_sensor_context( - sensor_name="my_email_failure_sensor", - dagster_instance=instance, - dagster_run=dagster_run, - dagster_event=dagster_event, -).for_run_failure() - -# run the sensor -my_email_failure_sensor(run_failure_sensor_context) -``` + Note the additional function call after creating the `context`. The `context` provided by is a subclass of the context provided by and can be built using this additional call. 
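Putting these steps together, one way to package them is a single pytest-style test function. The following is a minimal sketch that assumes `my_job_fails` and `my_email_failure_sensor` are defined as shown above:

```python
from dagster import DagsterInstance, build_run_status_sensor_context


def test_my_email_failure_sensor():
    # execute the failing job on an ephemeral instance
    instance = DagsterInstance.ephemeral()
    result = my_job_fails.execute_in_process(instance=instance, raise_on_error=False)

    # build a run failure sensor context from the run and its failure event
    context = build_run_status_sensor_context(
        sensor_name="my_email_failure_sensor",
        dagster_instance=instance,
        dagster_run=result.dagster_run,
        dagster_event=result.get_job_failure_event(),
    ).for_run_failure()

    # invoke the sensor directly; it should send the email alert
    my_email_failure_sensor(context)
```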
diff --git a/docs/docs-beta/docs/guides/automate/sensors/testing-sensors.md b/docs/docs-beta/docs/guides/automate/sensors/testing-sensors.md index 6b43387d7c563..d2c6a1c5508de 100644 --- a/docs/docs-beta/docs/guides/automate/sensors/testing-sensors.md +++ b/docs/docs-beta/docs/guides/automate/sensors/testing-sensors.md @@ -52,62 +52,16 @@ dagster sensor preview my_sensor_name To unit test sensors, you can directly invoke the sensor's Python function. This will return all the run requests yielded by the sensor. The config obtained from the returned run requests can be validated using the function: -{/* TODO convert to */} -```python file=concepts/partitions_schedules_sensors/sensors/sensors.py startafter=start_sensor_testing endbefore=end_sensor_testing -from dagster import validate_run_config - -@sensor(target=log_file_job) -def sensor_to_test(): - yield RunRequest( - run_key="foo", - run_config={"ops": {"process_file": {"config": {"filename": "foo"}}}}, - ) - - -def test_sensor(): - for run_request in sensor_to_test(): - assert validate_run_config(log_file_job, run_request.run_config) -``` + Notice that since the context argument wasn't used in the sensor, a context object doesn't have to be provided. However, if the context object **is** needed, it can be provided via . Consider again the `my_directory_sensor_cursor` example: -{/* TODO convert to */} -```python file=concepts/partitions_schedules_sensors/sensors/sensors.py startafter=start_cursor_sensors_marker endbefore=end_cursor_sensors_marker -@sensor(target=log_file_job) -def my_directory_sensor_cursor(context): - last_mtime = float(context.cursor) if context.cursor else 0 - - max_mtime = last_mtime - for filename in os.listdir(MY_DIRECTORY): - filepath = os.path.join(MY_DIRECTORY, filename) - if os.path.isfile(filepath): - fstats = os.stat(filepath) - file_mtime = fstats.st_mtime - if file_mtime <= last_mtime: - continue - - # the run key should include mtime if we want to kick off new runs based on file modifications - run_key = f"{filename}:{file_mtime}" - run_config = {"ops": {"process_file": {"config": {"filename": filename}}}} - yield RunRequest(run_key=run_key, run_config=run_config) - max_mtime = max(max_mtime, file_mtime) - - context.update_cursor(str(max_mtime)) -``` + This sensor uses the `context` argument. To invoke it, we need to provide one: -{/* TODO convert to */} -```python file=concepts/partitions_schedules_sensors/sensors/sensors.py startafter=start_sensor_testing_with_context endbefore=end_sensor_testing_with_context -from dagster import build_sensor_context - - -def test_my_directory_sensor_cursor(): - context = build_sensor_context(cursor="0") - for run_request in my_directory_sensor_cursor(context): - assert validate_run_config(log_file_job, run_request.run_config) -``` + **Testing sensors with resources** @@ -115,19 +69,8 @@ For sensors which utilize [resources](/guides/build/external-resources/), you ca Below is a test for the `process_new_users_sensor` that we defined in "[Using resources in sensors](using-resources-in-sensors)", which uses the `users_api` resource. 
-{/* TODO convert to */} -```python file=/concepts/resources/pythonic_resources.py startafter=start_test_resource_on_sensor endbefore=end_test_resource_on_sensor dedent=4 -from dagster import build_sensor_context, validate_run_config - -def test_process_new_users_sensor(): - class FakeUsersAPI: - def fetch_users(self) -> list[str]: - return ["1", "2", "3"] - - context = build_sensor_context() - run_requests = process_new_users_sensor(context, users_api=FakeUsersAPI()) - assert len(run_requests) == 3 -``` +{/* TODO add dedent=4 prop to CodeExample below when implemented */} + \ No newline at end of file diff --git a/docs/docs-beta/docs/guides/automate/sensors/using-resources-in-sensors.md b/docs/docs-beta/docs/guides/automate/sensors/using-resources-in-sensors.md index d8db6260abd65..4ded844826ee6 100644 --- a/docs/docs-beta/docs/guides/automate/sensors/using-resources-in-sensors.md +++ b/docs/docs-beta/docs/guides/automate/sensors/using-resources-in-sensors.md @@ -9,51 +9,7 @@ To specify resource dependencies, annotate the resource as a parameter to the se Here, a resource is provided which provides access to an external API. The same resource could be used in the job or assets that the sensor triggers. -{/* TODO convert to */} -```python file=/concepts/resources/pythonic_resources.py startafter=start_new_resource_on_sensor endbefore=end_new_resource_on_sensor dedent=4 -from dagster import ( - sensor, - RunRequest, - SensorEvaluationContext, - ConfigurableResource, - job, - Definitions, - RunConfig, -) -import requests -from typing import List - -class UsersAPI(ConfigurableResource): - url: str - - def fetch_users(self) -> list[str]: - return requests.get(self.url).json() - -@job -def process_user(): ... - -@sensor(job=process_user) -def process_new_users_sensor( - context: SensorEvaluationContext, - users_api: UsersAPI, -): - last_user = int(context.cursor) if context.cursor else 0 - users = users_api.fetch_users() - - num_users = len(users) - for user_id in users[last_user:]: - yield RunRequest( - run_key=user_id, - tags={"user_id": user_id}, - ) - - context.update_cursor(str(num_users)) - -defs = Definitions( - jobs=[process_user], - sensors=[process_new_users_sensor], - resources={"users_api": UsersAPI(url="https://my-api.com/users")}, -) -``` +{/* TODO add dedent=4 prop to CodeExample below when implemented */} + For more information on resources, refer to the [Resources documentation](/guides/build/external-resources). To see how to test schedules with resources, refer to the section on testing sensors with resources in "[Testing sensors](testing-sensors)". 
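As a further sketch (not part of the example above), the same `UsersAPI` resource could be configured from an environment variable instead of a hardcoded URL. This assumes a `USERS_API_URL` variable is set wherever runs are launched:

```python
from dagster import Definitions, EnvVar

defs = Definitions(
    jobs=[process_user],
    sensors=[process_new_users_sensor],
    # EnvVar defers reading USERS_API_URL until a run is launched
    # and keeps the value out of the Dagster UI
    resources={"users_api": UsersAPI(url=EnvVar("USERS_API_URL"))},
)
```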
From 3e96b687b8112b77960f9de3883e3d6c8f889dd0 Mon Sep 17 00:00:00 2001 From: nikki everett Date: Thu, 6 Feb 2025 00:16:47 -0700 Subject: [PATCH 4/9] convert example code in build docs Signed-off-by: nikki everett --- .../docs/guides/build/assets/asset-jobs.md | 22 +- .../assets/asset-versioning-and-caching.md | 139 +----- .../assets/metadata-and-tags/kind-tags.md | 24 +- .../external-pipelines/aws-lambda-pipeline.md | 1 - .../external-pipelines/pyspark-pipeline.md | 196 +------- .../create-subprocess-asset.md | 97 +--- .../using-dagster-pipes/index.md | 15 +- .../modify-external-code.md | 180 +------ .../using-dagster-pipes/reference.md | 465 +----------------- .../configuring-resources.md | 112 +---- .../external-resources/connecting-to-apis.md | 2 - .../external-resources/defining-resources.md | 58 +-- .../managing-resource-state.md | 68 +-- .../testing-configurable-resources.md | 62 +-- .../using-bare-python-objects-as-resources.md | 19 +- .../defining-a-custom-io-manager.md | 199 +------- .../backfilling-data.md | 24 +- .../partitioning-assets.md | 2 - 18 files changed, 87 insertions(+), 1598 deletions(-) diff --git a/docs/docs-beta/docs/guides/build/assets/asset-jobs.md b/docs/docs-beta/docs/guides/build/assets/asset-jobs.md index 821b8c21f6b9a..e5b9fac4a5f55 100644 --- a/docs/docs-beta/docs/guides/build/assets/asset-jobs.md +++ b/docs/docs-beta/docs/guides/build/assets/asset-jobs.md @@ -9,7 +9,6 @@ Jobs are the main unit of execution and monitoring for [asset definitions](/guid - At fixed intervals, by [schedules](/guides/automate/schedules) - When external changes occur, using [sensors](/guides/automate/sensors) - ## Creating asset jobs In this section, we'll demonstrate how to create a few asset jobs that target the following assets: @@ -29,26 +28,7 @@ You can target one or multiple assets, or create multiple jobs that target overl Including the jobs in a [`Definitions`](/api/python-api/definitions) object located at the top level of a Python module or file makes asset jobs available to the UI, GraphQL, and the command line. The Dagster tool loads that module as a code location. If you include schedules or sensors, the [code location](/guides/deploy/code-locations) will automatically include jobs that those schedules or sensors target. 
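To illustrate the last point, here is a minimal sketch (with hypothetical names) showing that a job referenced only by a schedule does not need to be listed in `jobs`; the canonical example follows below:

```python
import dagster as dg


@dg.asset
def my_asset(): ...


my_asset_job = dg.define_asset_job(name="my_asset_job", selection="my_asset")

# run my_asset_job every day at midnight
my_asset_schedule = dg.ScheduleDefinition(job=my_asset_job, cron_schedule="0 0 * * *")

# my_asset_job is included automatically because my_asset_schedule targets it
defs = dg.Definitions(assets=[my_asset], schedules=[my_asset_schedule])
```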
-```python file=/concepts/assets/jobs_to_definitions.py -import dagster as dg - - -@dg.asset -def number_asset(): - yield dg.MaterializeResult( - metadata={ - "number": 1, - } - ) - - -number_asset_job = dg.define_asset_job(name="number_asset_job", selection="number_asset") - -defs = dg.Definitions( - assets=[number_asset], - jobs=[number_asset_job], -) -``` + ## Testing asset jobs diff --git a/docs/docs-beta/docs/guides/build/assets/asset-versioning-and-caching.md b/docs/docs-beta/docs/guides/build/assets/asset-versioning-and-caching.md index 5ba4454d4a2a3..6351fd85c3508 100644 --- a/docs/docs-beta/docs/guides/build/assets/asset-versioning-and-caching.md +++ b/docs/docs-beta/docs/guides/build/assets/asset-versioning-and-caching.md @@ -36,15 +36,8 @@ By default, Dagster automatically computes a data version for each materializati Let's start with a trivial asset that returns a hardcoded number: -{/* TODO convert to */} -```python file=/guides/dagster/asset_versioning_and_caching/vanilla_asset.py -from dagster import asset - -@asset -def a_number(): - return 1 -``` + Next, start the Dagster UI: @@ -66,15 +59,7 @@ If you materialize the asset again, you'll notice that both the code version and Let's improve this situation by setting an explicit code version. Add a `code_version` on the asset: -{/* TODO convert to */} -```python file=/guides/dagster/asset_versioning_and_caching/vanilla_asset_with_code_version.py -from dagster import asset - - -@asset(code_version="v1") -def versioned_number(): - return 1 -``` + Now, materialize the asset. The user-defined code version `v1` will be associated with the latest materialization: @@ -82,15 +67,7 @@ Now, materialize the asset. The user-defined code version `v1` will be associate Now, let's update the code and inform Dagster that the code has changed. Do this by changing the `code_version` argument: -{/* TODO convert to */} -```python file=/guides/dagster/asset_versioning_and_caching/vanilla_asset_with_code_version_v2.py -from dagster import asset - - -@asset(code_version="v2") -def versioned_number(): - return 11 -``` + Click **Reload definitions** to pick up the changes. @@ -104,20 +81,8 @@ The `versioned_number` asset must be materialized again to become up-to-date. Cl Tracking changes becomes more powerful when there are dependencies in play. Let's add an asset downstream of our first asset: -{/* TODO convert to */} -```python file=/guides/dagster/asset_versioning_and_caching/dependencies_code_version_only.py -from dagster import asset - -@asset(code_version="v2") -def versioned_number(): - return 11 - - -@asset(code_version="v1") -def multiplied_number(versioned_number): - return versioned_number * 2 -``` + In the Dagster UI, click **Reload definitions**. The `multipled_number` asset will be marked as **Never materialized**. @@ -129,20 +94,7 @@ In the created run, only the step associated with `multiplied_number` is run. Th Now, let's update the `versioned_number` asset. Specifically, we'll change its return value and code version: -{/* TODO convert to */} -```python file=/guides/dagster/asset_versioning_and_caching/dependencies_code_version_only_v2.py -from dagster import asset - - -@asset(code_version="v3") -def versioned_number(): - return 15 - - -@asset(code_version="v1") -def multiplied_number(versioned_number): - return versioned_number * 2 -``` + As before, this will cause `versioned_number` to get a label indicating that its code version has changed since its latest materialization. 
But since `multiplied_number` depends on `versioned_number`, it must be recomputed as well and so gets a label indicating that the code version of an upstream asset has changed. If you hover over the **Upstream code version** tag on `multiplied_number`, you will see the upstream asset whose code version has changed: @@ -158,21 +110,7 @@ For example, when a materialization function contains an element of randomness, Dagster accommodates these and similar scenarios by allowing user code to supply its own data versions. To do so, include the data version alongside the returned asset value in an object. Let's update `versioned_number` to do this. For simplicity, you'll use the stringified return value as the data version: -{/* TODO convert to */} -```python file=/guides/dagster/asset_versioning_and_caching/manual_data_versions_1.py -from dagster import DataVersion, Output, asset - - -@asset(code_version="v4") -def versioned_number(): - value = 20 - return Output(value, data_version=DataVersion(str(value))) - - -@asset(code_version="v1") -def multiplied_number(versioned_number): - return versioned_number * 2 -``` + Both assets get labels to indicate that they're impacted by the new code version of `versioned_number`. Let's re-materialize them both to make them fresh. Notice the `DataVersion` of `versioned_number` is now `20`: @@ -180,21 +118,7 @@ Both assets get labels to indicate that they're impacted by the new code version Let's simulate a cosmetic refactor by updating `versioned_number` again, but without changing the returned value. Bump the code version to `v5` and change `20` to `10 + 10`: -{/* TODO convert to */} -```python file=/guides/dagster/asset_versioning_and_caching/manual_data_versions_2.py -from dagster import DataVersion, Output, asset - - -@asset(code_version="v5") -def versioned_number(): - value = 10 + 10 - return Output(value, data_version=DataVersion(str(value))) - - -@asset(code_version="v1") -def multiplied_number(versioned_number): - return versioned_number * 2 -``` + Once again, both assets have labels to indicate the change in the code version. Dagster doesn't know that `v5` of the versioned number will return the same value as `v4`, as it only knows about code versions and data versions. @@ -214,52 +138,11 @@ External data sources in Dagster are modeled by called `input_number`. This will represent a file written by an external process upstream of our pipeline: -{/* TODO convert to */} -```python file=/guides/dagster/asset_versioning_and_caching/input_number.txt -29034 -``` + The body of the `input_number` function computes a hash of the file contents and returns it as a `DataVersion`. 
We'll set `input_number` as an upstream dependency of `versioned_number` and have `versioned_number` return the value it reads from the file: -{/* TODO convert to */} -```python file=/guides/dagster/asset_versioning_and_caching/observable_source_asset_path_with_non_argument_deps.py -from hashlib import sha256 - -from dagster import ( - DataVersion, - Output, - asset, - file_relative_path, - observable_source_asset, -) - - -def sha256_digest_from_str(string: str) -> str: - hash_sig = sha256() - hash_sig.update(bytearray(string, "utf8")) - return hash_sig.hexdigest() - - -FILE_PATH = file_relative_path(__file__, "input_number.txt") - - -@observable_source_asset -def input_number(): - with open(FILE_PATH) as ff: - return DataVersion(sha256_digest_from_str(ff.read())) - - -@asset(code_version="v6", deps=[input_number]) -def versioned_number(): - with open(FILE_PATH) as ff: - value = int(ff.read()) - return Output(value, data_version=DataVersion(str(value))) - - -@asset(code_version="v1") -def multiplied_number(versioned_number): - return versioned_number * 2 -``` + Adding an observable source asset to an asset graph will cause a new button, **Observe sources**, to appear: @@ -275,8 +158,6 @@ We also see that `versioned_number` and `multiplied_number` have labels indicati Finally, let's manually alter the file to simulate the activity of an external process. Change the content of `input_number.txt`: -```python file=/guides/dagster/asset_versioning_and_caching/input_number_v2.txt -15397 -``` + If we click the **Observe Sources** button again, the downstream assets will again have labels indicating that upstream data has changed. The observation run generated a new data version for `input_number` because its content changed. diff --git a/docs/docs-beta/docs/guides/build/assets/metadata-and-tags/kind-tags.md b/docs/docs-beta/docs/guides/build/assets/metadata-and-tags/kind-tags.md index 6d3f9ef2d8097..8b0a7597a987b 100644 --- a/docs/docs-beta/docs/guides/build/assets/metadata-and-tags/kind-tags.md +++ b/docs/docs-beta/docs/guides/build/assets/metadata-and-tags/kind-tags.md @@ -10,31 +10,11 @@ Kind tags can help you quickly identify the underlying system or technology used You may add up to three kinds to the `kinds` argument of an , which can be useful to represent multiple technologies or systems that an asset is associated with. For example, an asset which is built by Python code and stored in Snowflake can be tagged with both `python` and `snowflake` kinds: -{/* TODO convert to */} -```python file=/concepts/metadata-tags/asset_kinds.py -from dagster import asset - - -@asset(kinds={"python", "snowflake"}) -def my_asset(): - pass -``` + Kinds can also be specified on an , for use in multi-assets: -```python file=/concepts/metadata-tags/asset_kinds_multi.py -from dagster import AssetSpec, multi_asset - - -@multi_asset( - specs=[ - AssetSpec("foo", kinds={"python", "snowflake"}), - AssetSpec("bar", kinds={"python", "postgres"}), - ] -) -def my_multi_asset(): - pass -``` + On the backend, these kind inputs are stored as tags on the asset. For more information, see [Tags](/guides/build/assets/metadata-and-tags/index.md#tags). 
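As an illustrative sketch of that storage model (the `dagster/kind/<name>` tag-key convention and the `specs`/`kinds` accessors are assumptions here, not part of the example above):

```python
from dagster import asset


@asset(kinds={"python", "snowflake"})
def my_asset():
    pass


# the declared kinds surface on the asset's spec; on the backend they are
# stored as tags keyed like `dagster/kind/python` and `dagster/kind/snowflake`
spec = next(iter(my_asset.specs))
assert spec.kinds == {"python", "snowflake"}
```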
diff --git a/docs/docs-beta/docs/guides/build/external-pipelines/aws-lambda-pipeline.md b/docs/docs-beta/docs/guides/build/external-pipelines/aws-lambda-pipeline.md index 843c927291eb6..af2dd90efac91 100644 --- a/docs/docs-beta/docs/guides/build/external-pipelines/aws-lambda-pipeline.md +++ b/docs/docs-beta/docs/guides/build/external-pipelines/aws-lambda-pipeline.md @@ -88,7 +88,6 @@ For simplicity, we're going to copy the contents of the single Dagster Pipes fil In this step, you'll add the code you want to execute to the function. Create another file in the AWS UI - or use the default `lambda_function.py` file created by the function - and paste in the following code: -{/* TODO convert to */} :::tip diff --git a/docs/docs-beta/docs/guides/build/external-pipelines/pyspark-pipeline.md b/docs/docs-beta/docs/guides/build/external-pipelines/pyspark-pipeline.md index e48475b295987..b3051602784b1 100644 --- a/docs/docs-beta/docs/guides/build/external-pipelines/pyspark-pipeline.md +++ b/docs/docs-beta/docs/guides/build/external-pipelines/pyspark-pipeline.md @@ -46,94 +46,14 @@ We will set up a few non-default Pipes components to streamline the otherwise ch 1. Let's start by creating the asset and opening a Pipes session. We will be using S3 to pass Pipes messages from the Spark job to Dagster, so we will create `PipesS3MessageReader` and `PipesS3ContextInjector` objects. (Technically, it's not strictly required to use S3 for passing the Dagster context, but storing it there will decrease the CLI arguments size). -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/pyspark/dagster_code.py startafter=start_pipes_session_marker endbefore=end_pipes_session_marker -import os -import subprocess -from collections.abc import Mapping, Sequence -from pathlib import Path - -import boto3 -from dagster_aws.pipes import PipesS3ContextInjector, PipesS3MessageReader - -import dagster as dg - -LOCAL_SCRIPT_PATH = Path(__file__).parent / "script.py" - - -@dg.asset -def pipes_spark_asset(context: dg.AssetExecutionContext): - s3_client = boto3.client("s3") - - bucket = os.environ["DAGSTER_PIPES_BUCKET"] - - # upload the script to S3 - # ideally, this should be done via CI/CD processes and not in the asset body - # but for the sake of this example we are doing it here - s3_script_path = f"{context.dagster_run.run_id}/pyspark_script.py" - s3_client.upload_file(LOCAL_SCRIPT_PATH, bucket, s3_script_path) - - context_injector = PipesS3ContextInjector( - client=s3_client, - bucket=bucket, - ) - - message_reader = PipesS3MessageReader( - client=s3_client, - bucket=bucket, - # the following setting will configure the Spark job to collect logs from the driver - # and send them to Dagster via Pipes - include_stdio_in_messages=True, - ) -``` + Notice how `PipesS3MessageReader` has `include_stdio_in_messages=True`. This setting will configure the Pipes **message writer** in the Spark job to collect logs from the Spark driver and send them to Dagster via Pipes messages. 2. We will be using CLI arguments to pass the bootstrap information from Dagster to the Spark job. We will fetch them from the `session.get_bootstrap_cli_arguments` method. We pass these arguments to `spark-submit` along with a few other settings. 
-{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/pyspark/dagster_code.py startafter=end_pipes_session_marker endbefore=start_definitions_marker -# pipes_spark_asset body continues below - with dg.open_pipes_session( - context=context, - message_reader=message_reader, - context_injector=context_injector, - ) as session: - dagster_pipes_args = " ".join( - # prepare Pipes bootstrap CLI arguments - [ - f"{key} {value}" - for key, value in session.get_bootstrap_cli_arguments().items() - ] - ) - - cmd = " ".join( - [ - "spark-submit", - # change --master and --deploy-mode according to specific Spark setup - "--master", - "local[*]", - "--conf", - "spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem", - # custom S3 endpoint for MinIO - "--conf", - "spark.hadoop.fs.s3a.endpoint=http://minio:9000", - "--conf", - "spark.hadoop.fs.s3a.path.style.access=true", - f"s3a://{bucket}/{s3_script_path}", - dagster_pipes_args, - ] - ) - - subprocess.run( - # we do not forward stdio on purpose to demonstrate how Pipes collect logs from the driver - cmd, - shell=True, - check=True, - ) - - return session.get_results() -``` + + :::note @@ -145,50 +65,7 @@ In other Pipes workflows, passing the bootstrap information from Dagster to the First, create a new file named `script.py`, then add the following code to create a context that can be used to send messages to Dagster: -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/pyspark/script.py startafter -import boto3 -from dagster_pipes import ( - PipesCliArgsParamsLoader, - PipesS3ContextLoader, - PipesS3MessageWriter, - open_dagster_pipes, -) -from pyspark.sql import SparkSession - - -def main(): - with open_dagster_pipes( - message_writer=PipesS3MessageWriter(client=boto3.client("s3")), - context_loader=PipesS3ContextLoader(client=boto3.client("s3")), - params_loader=PipesCliArgsParamsLoader(), - ) as pipes: - print("Hello from the Spark driver!") - - pipes.log.info("I am logging a Dagster message from the Spark driver!") - - spark = SparkSession.builder.appName("HelloWorld").getOrCreate() - - df = spark.createDataFrame( - [(1, "Alice", 34), (2, "Bob", 45), (3, "Charlie", 56)], - ["id", "name", "age"], - ) - - # calculate a really important statistic - avg_age = float(df.agg({"age": "avg"}).collect()[0][0]) - - # attach it to the asset materialization in Dagster - pipes.report_asset_materialization( - metadata={"average_age": {"raw_value": avg_age, "type": "float"}}, - data_version="alpha", - ) - - spark.stop() - - -if __name__ == "__main__": - main() -``` + Note how `PipesCliArgsParamsLoader` is used to load the CLI arguments passed by Dagster. This information will be used to automatically configure `PipesS3MessageWriter` and `PipesS3ContextLoader`. @@ -229,70 +106,7 @@ COPY dagster_code.py script.py ./ 3. Create a `docker-compose.yml`: -{/* TODO convert to */} -```yaml file=/guides/dagster/dagster_pipes/pyspark/docker-compose.yml -# this docker compose file creates a mini Spark cluster with 1 master and 2 workers to simulate a distributed environment - -volumes: - spark-logs: - spark-data: - minio-data: - dagster_home: - -networks: - spark: - -services: - minio: - image: bitnami/minio - ports: - - "9000:9000" - - "9001:9001" - environment: - MINIO_ROOT_USER: minio - MINIO_ROOT_PASSWORD: minio123 - MINIO_DEFAULT_BUCKETS: "dagster-pipes:public" - volumes: - - minio-data:/data - networks: - - spark - - dagster-dev: - develop: - watch: - - action: sync - path: . - target: /src - build: - context: . 
- dockerfile: Dockerfile - command: - - "dagster" - - "dev" - - "-f" - - "/src/dagster_code.py" - - "--host" - - "0.0.0.0" - - "--port" - - "3000" - ports: - - "3000:3000" - volumes: - - spark-logs:/spark/logs - - spark-data:/spark/data - - dagster_home:/dagster_home - environment: - AWS_ACCESS_KEY_ID: minio - AWS_SECRET_ACCESS_KEY: minio123 - AWS_ENDPOINT_URL: http://minio:9000 - DAGSTER_PIPES_BUCKET: dagster-pipes - - depends_on: - - minio - - networks: - - spark -``` + 4. Start the Dagster dev instance inside Docker: diff --git a/docs/docs-beta/docs/guides/build/external-pipelines/using-dagster-pipes/create-subprocess-asset.md b/docs/docs-beta/docs/guides/build/external-pipelines/using-dagster-pipes/create-subprocess-asset.md index 6a832aaabdefc..2c70ae53c0986 100644 --- a/docs/docs-beta/docs/guides/build/external-pipelines/using-dagster-pipes/create-subprocess-asset.md +++ b/docs/docs-beta/docs/guides/build/external-pipelines/using-dagster-pipes/create-subprocess-asset.md @@ -18,20 +18,7 @@ In this part of the tutorial, you'll create a Dagster asset that, in its executi Before getting started, make sure you have fulfilled all the [prerequisites](index.md#prerequisites) for the tutorial. You should have a standalone Python script named `external_code.py` which looks like the following: -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/part_1/external_code.py lines=2- -import pandas as pd - - -def main(): - orders_df = pd.DataFrame({"order_id": [1, 2], "item_id": [432, 878]}) - total_orders = len(orders_df) - print(f"processing total {total_orders} orders") - - -if __name__ == "__main__": - main() -``` + ### Step 1.1: Define the asset @@ -39,25 +26,8 @@ First, create a new file named `dagster_code.py` in the same directory as the `e Next, you’ll define the asset. 
Copy and paste the following into the file: -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/part_1/dagster_code.py startafter=start_asset_marker endbefore=end_asset_marker lines=-16 -import shutil -from dagster import ( - AssetExecutionContext, - MaterializeResult, - PipesSubprocessClient, - asset, - file_relative_path, -) - - -@asset -def subprocess_asset( - context: AssetExecutionContext, pipes_subprocess_client: PipesSubprocessClient -) -> MaterializeResult: - cmd = [shutil.which("python"), file_relative_path(__file__, "external_code.py")] -``` + Here’s what we did in this example: @@ -72,28 +42,8 @@ Here’s what we did in this example: Then, invoke a subprocess that executes the external code from the asset using the `pipes_subprocess_client` resource: -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/part_1/dagster_code.py startafter=start_asset_marker endbefore=end_asset_marker -import shutil - -from dagster import ( - AssetExecutionContext, - MaterializeResult, - PipesSubprocessClient, - asset, - file_relative_path, -) - - -@asset -def subprocess_asset( - context: AssetExecutionContext, pipes_subprocess_client: PipesSubprocessClient -) -> MaterializeResult: - cmd = [shutil.which("python"), file_relative_path(__file__, "external_code.py")] - return pipes_subprocess_client.run( - command=cmd, context=context - ).get_materialize_result() -``` + + Let’s take a look at what this code does: @@ -108,46 +58,11 @@ To make the asset and subprocess resource loadable and accessible by Dagster's t Copy and paste the following to the bottom of `dagster_code.py`: -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/part_1/dagster_code.py startafter=start_definitions_marker endbefore=end_definitions_marker -from dagster import Definitions - -defs = Definitions( - assets=[subprocess_asset], - resources={"pipes_subprocess_client": PipesSubprocessClient()}, -) -``` + At this point, `dagster_code.py` should look like the following: -```python file=/guides/dagster/dagster_pipes/subprocess/part_1/dagster_code_finished.py -import shutil - -from dagster import ( - AssetExecutionContext, - Definitions, - MaterializeResult, - PipesSubprocessClient, - asset, - file_relative_path, -) - - -@asset -def subprocess_asset( - context: AssetExecutionContext, pipes_subprocess_client: PipesSubprocessClient -) -> MaterializeResult: - cmd = [shutil.which("python"), file_relative_path(__file__, "external_code.py")] - return pipes_subprocess_client.run( - command=cmd, context=context - ).get_materialize_result() - - -defs = Definitions( - assets=[subprocess_asset], - resources={"pipes_subprocess_client": PipesSubprocessClient()}, -) -``` + ## Step 3: Run the subprocess from the Dagster UI diff --git a/docs/docs-beta/docs/guides/build/external-pipelines/using-dagster-pipes/index.md b/docs/docs-beta/docs/guides/build/external-pipelines/using-dagster-pipes/index.md index 55aeae654c1ab..76a390b74f492 100644 --- a/docs/docs-beta/docs/guides/build/external-pipelines/using-dagster-pipes/index.md +++ b/docs/docs-beta/docs/guides/build/external-pipelines/using-dagster-pipes/index.md @@ -26,20 +26,7 @@ You'll also need **an existing Python script.** We’ll use the following Python Create a file named `external_code.py` and paste the following into it: -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/part_1/external_code.py lines=2- -import pandas as pd - - -def main(): - orders_df = pd.DataFrame({"order_id": 
[1, 2], "item_id": [432, 878]}) - total_orders = len(orders_df) - print(f"processing total {total_orders} orders") - - -if __name__ == "__main__": - main() -``` + ## Ready to get started? diff --git a/docs/docs-beta/docs/guides/build/external-pipelines/using-dagster-pipes/modify-external-code.md b/docs/docs-beta/docs/guides/build/external-pipelines/using-dagster-pipes/modify-external-code.md index d52c5e9321e0e..0062dc3c5ec74 100644 --- a/docs/docs-beta/docs/guides/build/external-pipelines/using-dagster-pipes/modify-external-code.md +++ b/docs/docs-beta/docs/guides/build/external-pipelines/using-dagster-pipes/modify-external-code.md @@ -35,49 +35,14 @@ Getting external code to send information back to Dagster via Dagster Pipes requ In our sample Python script, the changes would look like the following: -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/part_2/step_1/external_code.py lines=2- -import pandas as pd -from dagster_pipes import PipesContext, open_dagster_pipes - - -def main(): - orders_df = pd.DataFrame({"order_id": [1, 2], "item_id": [432, 878]}) - total_orders = len(orders_df) - # get the Dagster Pipes context - context = PipesContext.get() - print(f"processing total {total_orders} orders") - - -if __name__ == "__main__": - # connect to Dagster Pipes - with open_dagster_pipes(): - main() -``` + ## Step 2: Send log messages to Dagster Dagster Pipes context offers a built-in logging capability that enables you to stream log messages back to Dagster. Instead of printing to the standard output, you can use the `context.log` method on to send log messages back to Dagster. In this case, we’re sending an `info` level log message: -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/part_2/step_2/external_code.py -import pandas as pd -from dagster_pipes import PipesContext, open_dagster_pipes - -def main(): - orders_df = pd.DataFrame({"order_id": [1, 2], "item_id": [432, 878]}) - total_orders = len(orders_df) - # get the Dagster Pipes context - context = PipesContext.get() - context.log.info(f"processing total {total_orders} orders") - - -if __name__ == "__main__": - # connect to Dagster Pipes - with open_dagster_pipes(): - main() -``` + Then, the log messages will show up in the **Run details** page of the Dagster UI. You can filter the log levels to only view `info` level messages: @@ -96,26 +61,8 @@ Similar to [reporting materialization metadata within the Dagster process](/guid In this example, we’re passing a piece of metadata named `total_orders` to the `metadata` parameter of the . 
This payload will be sent from the external process back to Dagster: -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/part_2/step_3_materialization/external_code.py -import pandas as pd -from dagster_pipes import PipesContext, open_dagster_pipes - - -def main(): - orders_df = pd.DataFrame({"order_id": [1, 2], "item_id": [432, 878]}) - total_orders = len(orders_df) - # get the Dagster Pipes context - context = PipesContext.get() - # send structured metadata back to Dagster - context.report_asset_materialization(metadata={"total_orders": total_orders}) - -if __name__ == "__main__": - # connect to Dagster Pipes - with open_dagster_pipes(): - main() -``` + Then, `total_orders` will show up in the UI as structured metadata: @@ -134,69 +81,13 @@ If your asset has data quality checks defined, you can report to Dagster that an -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess//part_2/step_3_check/external_code.py -import pandas as pd -from dagster_pipes import PipesContext, open_dagster_pipes - - -def main(): - orders_df = pd.DataFrame({"order_id": [1, 2], "item_id": [432, 878]}) - total_orders = len(orders_df) - # get the Dagster Pipes context - context = PipesContext.get() - # send structured metadata back to Dagster - context.report_asset_materialization(metadata={"total_orders": total_orders}) - # report data quality check result back to Dagster - context.report_asset_check( - passed=orders_df[["item_id"]].notnull().all().bool(), - check_name="no_empty_order_check", - ) - -if __name__ == "__main__": - # connect to Dagster Pipes - with open_dagster_pipes(): - main() -``` + -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/part_2/step_3_check/dagster_code.py -import shutil - -from dagster import ( - AssetCheckSpec, - AssetExecutionContext, - Definitions, - PipesSubprocessClient, - asset, - file_relative_path, -) - - -@asset( - check_specs=[AssetCheckSpec(name="no_empty_order_check", asset="subprocess_asset")], -) -def subprocess_asset( - context: AssetExecutionContext, pipes_subprocess_client: PipesSubprocessClient -): - cmd = [ - shutil.which("python"), - file_relative_path(__file__, "external_code.py"), - ] - return pipes_subprocess_client.run( - command=cmd, context=context - ).get_materialize_result() - - -defs = Definitions( - assets=[subprocess_asset], - resources={"pipes_subprocess_client": PipesSubprocessClient()}, -) -``` + @@ -215,69 +106,12 @@ At this point, your two files should look like the following: -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/part_2/step_3_check/external_code.py -import pandas as pd -from dagster_pipes import PipesContext, open_dagster_pipes - - -def main(): - orders_df = pd.DataFrame({"order_id": [1, 2], "item_id": [432, 878]}) - total_orders = len(orders_df) - # get the Dagster Pipes context - context = PipesContext.get() - # send structured metadata back to Dagster - context.report_asset_materialization(metadata={"total_orders": total_orders}) - # report data quality check result back to Dagster - context.report_asset_check( - passed=orders_df[["item_id"]].notnull().all().bool(), - check_name="no_empty_order_check", - ) - - -if __name__ == "__main__": - # connect to Dagster Pipes - with open_dagster_pipes(): - main() -``` + -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/part_2/step_3_check/dagster_code.py -import shutil - -from dagster import ( - AssetCheckSpec, - AssetExecutionContext, - 
Definitions, - PipesSubprocessClient, - asset, - file_relative_path, -) - - -@asset( - check_specs=[AssetCheckSpec(name="no_empty_order_check", asset="subprocess_asset")], -) -def subprocess_asset( - context: AssetExecutionContext, pipes_subprocess_client: PipesSubprocessClient -): - cmd = [ - shutil.which("python"), - file_relative_path(__file__, "external_code.py"), - ] - return pipes_subprocess_client.run( - command=cmd, context=context - ).get_materialize_result() - - -defs = Definitions( - assets=[subprocess_asset], - resources={"pipes_subprocess_client": PipesSubprocessClient()}, -) -``` + diff --git a/docs/docs-beta/docs/guides/build/external-pipelines/using-dagster-pipes/reference.md b/docs/docs-beta/docs/guides/build/external-pipelines/using-dagster-pipes/reference.md index 105db99e59ff6..bf9de0ebcf27f 100644 --- a/docs/docs-beta/docs/guides/build/external-pipelines/using-dagster-pipes/reference.md +++ b/docs/docs-beta/docs/guides/build/external-pipelines/using-dagster-pipes/reference.md @@ -14,32 +14,7 @@ When launching the subprocess, you may want to make environment variables or add In the external code, you can access extras via the `PipesContext` object: -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/with_extras_env/external_code.py lines=2- -import os - -import pandas as pd -from dagster_pipes import PipesContext, open_dagster_pipes - - -def main(): - orders_df = pd.DataFrame({"order_id": [1, 2], "item_id": [432, 878]}) - total_orders = len(orders_df) - # get the Dagster Pipes context - context = PipesContext.get() - # get all extras provided by Dagster asset - print(context.extras) - # get the value of an extra - print(context.get_extra("foo")) - # get env var - print(os.environ["MY_ENV_VAR_IN_SUBPROCESS"]) - - -if __name__ == "__main__": - # connect to Dagster Pipes - with open_dagster_pipes(): - main() -``` + @@ -48,40 +23,7 @@ The `run` method to the `PipesSubprocessClient` resource also accepts `env` and Note: We're using `os.environ` in this example, but Dagster's recommendation is to use in production. -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/with_extras_env/dagster_code.py -import shutil - -from dagster import ( - AssetExecutionContext, - Definitions, - MaterializeResult, - PipesSubprocessClient, - asset, - file_relative_path, -) - - -@asset -def subprocess_asset( - context: AssetExecutionContext, pipes_subprocess_client: PipesSubprocessClient -) -> MaterializeResult: - cmd = [shutil.which("python"), file_relative_path(__file__, "external_code.py")] - return pipes_subprocess_client.run( - command=cmd, - context=context, - extras={"foo": "bar"}, - env={ - "MY_ENV_VAR_IN_SUBPROCESS": "my_value", - }, - ).get_materialize_result() - - -defs = Definitions( - assets=[subprocess_asset], - resources={"pipes_subprocess_client": PipesSubprocessClient()}, -) -``` + @@ -96,74 +38,14 @@ Sometimes, you may not want to materialize an asset, but instead want to report From the external code, you can report to Dagster that an asset check has been performed via . 
Note that `asset_key` in this case is required, and must match the asset key defined in : -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/with_asset_check/external_code.py -import pandas as pd -from dagster_pipes import PipesContext, open_dagster_pipes - - -def main(): - orders_df = pd.DataFrame({"order_id": [1, 2], "item_id": [432, 878]}) - # get the Dagster Pipes context - context = PipesContext.get() - # send structured metadata back to Dagster - context.report_asset_check( - asset_key="my_asset", - passed=orders_df[["item_id"]].notnull().all().bool(), - check_name="no_empty_order_check", - ) - - -if __name__ == "__main__": - # connect to Dagster Pipes - with open_dagster_pipes(): - main() -``` + On Dagster's side, the `PipesClientCompletedInvocation` object returned from `PipesSubprocessClient` includes a `get_asset_check_result` method, which you can use to access the event reported by the subprocess. -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/with_asset_check/dagster_code.py -import shutil - -from dagster import ( - AssetCheckExecutionContext, - AssetCheckResult, - Definitions, - MaterializeResult, - PipesSubprocessClient, - asset, - asset_check, - file_relative_path, -) - - -@asset -def my_asset(): ... - - -@asset_check(asset="my_asset") -def no_empty_order_check( - context: AssetCheckExecutionContext, pipes_subprocess_client: PipesSubprocessClient -) -> AssetCheckResult: - cmd = [ - shutil.which("python"), - file_relative_path(__file__, "external_code.py"), - ] - return pipes_subprocess_client.run( - command=cmd, context=context.op_execution_context - ).get_asset_check_result() - - -defs = Definitions( - assets=[my_asset], - asset_checks=[no_empty_order_check], - resources={"pipes_subprocess_client": PipesSubprocessClient()}, -) -``` + @@ -184,35 +66,7 @@ Calling {method} with asset key {asset_key} is undefined. Asset has already been Instead, you’ll need to set the `asset_key` parameter for each instance of : -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/with_multi_asset/external_code.py -import pandas as pd -from dagster_pipes import PipesContext, open_dagster_pipes - - -def main(): - orders_df = pd.DataFrame( - {"order_id": [1, 2, 3], "item_id": [432, 878, 102], "user_id": ["a", "b", "a"]} - ) - total_orders = len(orders_df) - total_users = orders_df["user_id"].nunique() - - # get the Dagster Pipes context - context = PipesContext.get() - # send structured metadata back to Dagster. asset_key is required when there are multiple assets - context.report_asset_materialization( - asset_key="orders", metadata={"total_orders": total_orders} - ) - context.report_asset_materialization( - asset_key="users", metadata={"total_users": total_users} - ) - - -if __name__ == "__main__": - # connect to Dagster Pipes - with open_dagster_pipes(): - main() -``` + @@ -220,36 +74,7 @@ if __name__ == "__main__": In the Dagster code, you can use to define a single asset that represents multiple assets. 
The `PipesClientCompletedInvocation` object returned from `PipesSubprocessClient` includes a `get_results` method, which you can use to access all the events, such as multiple and , reported by the subprocess: -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/with_multi_asset/dagster_code.py -import shutil - -from dagster import ( - AssetExecutionContext, - AssetSpec, - Definitions, - PipesSubprocessClient, - file_relative_path, - multi_asset, -) - - -@multi_asset(specs=[AssetSpec("orders"), AssetSpec("users")]) -def subprocess_asset( - context: AssetExecutionContext, pipes_subprocess_client: PipesSubprocessClient -): - cmd = [ - shutil.which("python"), - file_relative_path(__file__, "external_code.py"), - ] - return pipes_subprocess_client.run(command=cmd, context=context).get_results() - - -defs = Definitions( - assets=[subprocess_asset], - resources={"pipes_subprocess_client": PipesSubprocessClient()}, -) -``` + @@ -263,93 +88,14 @@ Sometimes, you may want to pass data back from the external process for use in t In the external code, we send messages using `report_custom_message`. The message can be any data that is JSON serializable. -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/custom_messages/external_code.py -import pandas as pd -from dagster_pipes import PipesContext, open_dagster_pipes - - -def main(): - # get the Dagster Pipes context - context = PipesContext.get() - - # compute the full orders data - orders = pd.DataFrame( - { - "order_id": [1, 2, 3], - "item_id": [321, 654, 987], - "order_details": [..., ..., ...], # imagine large data, - # and more columns - } - ) - - # send a smaller table to be I/O managed by Dagster and passed to downstream assets - summary_table = pd.DataFrame(orders[["order_id", "item_id"]]) - context.report_custom_message(summary_table.to_dict()) - - context.report_asset_materialization(metadata={"total_orders": len(orders)}) - - -if __name__ == "__main__": - # connect to Dagster Pipes - with open_dagster_pipes(): - main() -``` + In the Dagster code we receive custom messages using `get_custom_messages`. -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/custom_messages/dagster_code.py -import shutil - -import pandas as pd - -from dagster import ( - AssetExecutionContext, - Definitions, - Output, - PipesSubprocessClient, - asset, - file_relative_path, -) - - -@asset -def subprocess_asset( - context: AssetExecutionContext, - pipes_subprocess_client: PipesSubprocessClient, -) -> Output[pd.DataFrame]: - cmd = [shutil.which("python"), file_relative_path(__file__, "external_code.py")] - result = pipes_subprocess_client.run( - command=cmd, - context=context, - ) - - # a small summary table gets reported as a custom message - messages = result.get_custom_messages() - if len(messages) != 1: - raise Exception("summary not reported") - - summary_df = pd.DataFrame(messages[0]) - - # grab any reported metadata off of the materialize result - metadata = result.get_materialize_result().metadata - - # return the summary table to be loaded by Dagster for downstream assets - return Output( - value=summary_df, - metadata=metadata, - ) - - -defs = Definitions( - assets=[subprocess_asset], - resources={"pipes_subprocess_client": PipesSubprocessClient()}, -) -``` + @@ -367,213 +113,36 @@ Below are examples of specifying data for all supported metadata types. 
Float, i #### URL Metadata -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/rich_metadata/url_metadata.py -def get_url(args): ... - - -context = ... - -# start_url -# Within the Dagster pipes subprocess: -url = "http://example.com" -# Then, when reporting the asset materialization: -context.report_asset_materialization( - asset_key="foo", - metadata={"url_meta": {"type": "url", "raw_value": url}}, -) -# end_url -``` + #### Path Metadata -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/rich_metadata/path_metadata.py -def get_path(args): ... - - -context = ... - -# start_path -# Within the Dagster pipes subprocess: -path = "/path/to/file.txt" -# Then, when reporting the asset materialization: -context.report_asset_materialization( - asset_key="foo", - metadata={"path_meta": {"type": "path", "raw_value": path}}, -) -# end_path -``` + #### Notebook Metadata -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/rich_metadata/notebook_metadata.py -def get_notebook_path(args): ... - - -context = ... - -# start_notebook -# Within the Dagster pipes subprocess: -notebook_path = "/path/to/notebook.ipynb" -# Then, when reporting the asset materialization: -context.report_asset_materialization( - asset_key="foo", - metadata={"notebook_meta": {"type": "notebook", "raw_value": notebook_path}}, -) -# end_notebook -``` + #### JSON Metadata -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/rich_metadata/json_metadata.py -def get_json_data(args): ... - - -context = ... - -# start_json -# Within the Dagster pipes subprocess: -json_data = ["item1", "item2", "item3"] -# Then, when reporting the asset materialization: -context.report_asset_materialization( - asset_key="foo", - metadata={"json_meta": {"type": "json", "raw_value": json_data}}, -) -# end_json -``` + #### Markdown Metadata -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/rich_metadata/markdown_metadata.py -def get_markdown_content(args): ... - - -context = ... - -# start_markdown -# Within the Dagster pipes subprocess: -markdown_content = "# Header\nSome **bold** text" -# Then, when reporting the asset materialization: -context.report_asset_materialization( - asset_key="foo", - metadata={"md_meta": {"type": "md", "raw_value": markdown_content}}, -) -# end_markdown -``` + #### Table Metadata -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/rich_metadata/table_metadata.py -context = ... - -# start_table -# Within the Dagster pipes subprocess: -schema = [ - { - "name": "column1", - "type": "string", - "description": "The first column", - "tags": {"source": "source1"}, - "constraints": {"unique": True}, - }, - { - "name": "column2", - "type": "int", - "description": "The second column", - "tags": {"source": "source2"}, - "constraints": {"min": 0, "max": 100}, - }, -] -records = [ - {"column1": "foo", "column2": 1}, - {"column1": "bar", "column2": 2}, -] -# Then, when reporting the asset materialization: -context.report_asset_materialization( - asset_key="foo", - metadata={ - "table_meta": { - "type": "table", - "raw_value": {"schema": schema, "records": records}, - } - }, -) -# end_table -``` + #### Table Schema Metadata -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/rich_metadata/table_schema_metadata.py -context = ... 
- -# start_table_schema -# Within the Dagster pipes subprocess: -schema = [ - { - "name": "column1", - "type": "string", - "description": "The first column", - "tags": {"source": "source1"}, - "constraints": {"unique": True}, - }, - { - "name": "column2", - "type": "int", - "description": "The second column", - "tags": {"source": "source2"}, - "constraints": {"min": 0, "max": 100}, - }, -] - -# Then, when reporting the asset materialization: -context.report_asset_materialization( - asset_key="foo", - metadata={ - "table_meta": { - "type": "table_schema", - "raw_value": schema, - } - }, -) -# end_table_schema -``` + #### Table Column Lineage Metadata -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/rich_metadata/table_column_lineage.py startafter=start_table_column_lineage endbefore=end_table_column_lineage -# Within the Dagster pipes subprocess: -lineage = { - "a": [{"asset_key": "upstream", "column": "column1"}], - "b": [{"asset_key": "upstream", "column": "column2"}], -} -# Then, when reporting the asset materialization: -context.report_asset_materialization( - asset_key="foo", - metadata={ - "lineage_meta": { - "type": "table_column_lineage", - "raw_value": {"table_column_lineage": lineage}, - } - }, -) -``` + #### Timestamp Metadata -{/* TODO convert to */} -```python file=/guides/dagster/dagster_pipes/subprocess/rich_metadata/timestamp_metadata.py startafter=start_timestamp endbefore=end_timestamp -# Within the Dagster pipes subprocess: -timestamp = 1234567890 -# Then, when reporting the asset materialization: -context.report_asset_materialization( - asset_key="foo", - metadata={"timestamp_meta": {"type": "timestamp", "raw_value": timestamp}}, -) -``` + diff --git a/docs/docs-beta/docs/guides/build/external-resources/configuring-resources.md b/docs/docs-beta/docs/guides/build/external-resources/configuring-resources.md index ce6dc04075d8f..495d406bbce06 100644 --- a/docs/docs-beta/docs/guides/build/external-resources/configuring-resources.md +++ b/docs/docs-beta/docs/guides/build/external-resources/configuring-resources.md @@ -11,23 +11,10 @@ Resources can be configured using environment variables, which is useful for sec To use environment variables, pass an when constructing the resource. `EnvVar` inherits from `str` and can be used to populate any string config field on a resource. The value of the environment variable will be evaluated when a run is launched. -```python file=/concepts/resources/pythonic_resources.py startafter=start_new_resources_env_vars endbefore=end_new_resources_env_vars dedent=4 -from dagster import EnvVar, Definitions, ConfigurableResource - -class CredentialsResource(ConfigurableResource): - username: str - password: str - -defs = Definitions( - assets=..., - resources={ - "credentials": CredentialsResource( - username=EnvVar("MY_USERNAME"), - password=EnvVar("MY_PASSWORD"), - ) - }, -) -``` +{/* TODO add dedent=4 prop to CodeExample below when implemented */} + + +:::note **What about `os.getenv()`?** When `os.getenv()` is used, the value of the variable is retrieved when Dagster loads the code location. Using `EnvVar` not only tells Dagster to retrieve the value at runtime, but also not to display the value in the UI. @@ -35,53 +22,23 @@ defs = Definitions( For more information on using environment variables with Dagster, refer to the [Environment variables guide](/guides/deploy/using-environment-variables-and-secrets). 
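A short sketch of the difference, reusing the `CredentialsResource` from above (the environment variable names are placeholders):

```python
import os

from dagster import ConfigurableResource, EnvVar


class CredentialsResource(ConfigurableResource):
    username: str
    password: str


# os.getenv: resolved once, when the code location is loaded
eagerly_resolved = CredentialsResource(
    username=os.getenv("MY_USERNAME", ""),
    password=os.getenv("MY_PASSWORD", ""),
)

# EnvVar: resolved when a run is launched, and not displayed in the UI
resolved_at_runtime = CredentialsResource(
    username=EnvVar("MY_USERNAME"),
    password=EnvVar("MY_PASSWORD"),
)
```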
+::: + ## Configuring resources at launch time In some cases, you may want to specify configuration for a resource at launch time, in the Launchpad or in a for a [schedule](/guides/automate/schedules/) or [sensor](/guides/automate/sensors/). For example, you may want a sensor-triggered run to specify a different target table in a database resource for each run. You can use the `configure_at_launch()` method to defer the construction of a configurable resource until launch time: -```python file=/concepts/resources/pythonic_resources.py startafter=start_new_resource_runtime endbefore=end_new_resource_runtime dedent=4 -from dagster import ConfigurableResource, Definitions, asset - -class DatabaseResource(ConfigurableResource): - table: str - - def read(self): ... - -@asset -def data_from_database(db_conn: DatabaseResource): - return db_conn.read() - -defs = Definitions( - assets=[data_from_database], - resources={"db_conn": DatabaseResource.configure_at_launch()}, -) -``` +{/* TODO add dedent=4 prop to CodeExample below when implemented */} + ### Providing resource launch time configuration in Python code Then, configuration for the resource can be provided at launch time in the Launchpad or in Python code using the `config` parameter of the : -```python file=/concepts/resources/pythonic_resources.py startafter=start_new_resource_runtime_launch endbefore=end_new_resource_runtime_launch dedent=4 -from dagster import sensor, define_asset_job, RunRequest, RunConfig - -update_data_job = define_asset_job( - name="update_data_job", selection=[data_from_database] -) - -@sensor(job=update_data_job) -def table_update_sensor(): - tables = ... - for table_name in tables: - yield RunRequest( - run_config=RunConfig( - resources={ - "db_conn": DatabaseResource(table=table_name), - }, - ), - ) -``` +{/* TODO add dedent=4 prop to CodeExample below when implemented */} + ## Resources that depend on other resources @@ -89,57 +46,14 @@ In some situations, you may want to define a resource that depends on other reso In this case, you can list that nested resource as an attribute of the resource class: -```python file=/concepts/resources/pythonic_resources.py startafter=start_new_resources_nesting endbefore=end_new_resources_nesting dedent=4 -from dagster import Definitions, ConfigurableResource, ResourceDependency - -class CredentialsResource(ConfigurableResource): - username: str - password: str - -class FileStoreBucket(ConfigurableResource): - credentials: ResourceDependency[CredentialsResource] - region: str - - def write(self, data: str): - # We can access the credentials resource via `self.credentials`, - # which will be an initialized instance of `CredentialsResource` - get_filestore_client( - username=self.credentials.username, - password=self.credentials.password, - region=self.region, - ).write(data) - -defs = Definitions( - assets=[my_asset], - resources={ - "bucket": FileStoreBucket( - credentials=CredentialsResource( - username="my_user", password="my_password" - ), - region="us-east-1", - ), - }, -) -``` +{/* TODO add dedent=4 prop to CodeExample below when implemented */} + If you prefer to provide the configuration for credentials at launch time, use the `configure_at_launch()` method to defer the construction of the `CredentialsResource` until launch time. Because `credentials` requires launch time configuration through the launchpad, it must also be passed to the object, so that configuration can be provided at launch time. 
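A minimal sketch of this pattern is shown below: the deferred `CredentialsResource` is passed both to `Definitions` and to the resource that depends on it. The resource and asset names here are illustrative, not part of the original example.

```python
from dagster import ConfigurableResource, Definitions, ResourceDependency, asset


class CredentialsResource(ConfigurableResource):
    username: str
    password: str


class FileStoreBucket(ConfigurableResource):
    # Nested resource: an initialized CredentialsResource is available at runtime.
    credentials: ResourceDependency[CredentialsResource]
    region: str


@asset
def my_asset(bucket: FileStoreBucket): ...


# Defer construction of the credentials until launch time, then pass the same
# deferred resource both to Definitions and to the bucket that depends on it.
credentials = CredentialsResource.configure_at_launch()

defs = Definitions(
    assets=[my_asset],
    resources={
        "credentials": credentials,
        "bucket": FileStoreBucket(credentials=credentials, region="us-east-1"),
    },
)
```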
Nested resources only need to be passed to the object if they require launch time configuration. -```python file=/concepts/resources/pythonic_resources.py startafter=start_new_resource_dep_job_runtime endbefore=end_new_resource_dep_job_runtime dedent=4 -credentials = CredentialsResource.configure_at_launch() - -defs = Definitions( - assets=[my_asset], - resources={ - "credentials": credentials, - "bucket": FileStoreBucket( - credentials=credentials, - region="us-east-1", - ), - }, -) -``` + ## Next steps diff --git a/docs/docs-beta/docs/guides/build/external-resources/connecting-to-apis.md b/docs/docs-beta/docs/guides/build/external-resources/connecting-to-apis.md index 05c6aa4bb0b6b..4b513ebb981ce 100644 --- a/docs/docs-beta/docs/guides/build/external-resources/connecting-to-apis.md +++ b/docs/docs-beta/docs/guides/build/external-resources/connecting-to-apis.md @@ -11,8 +11,6 @@ This guide assumes familiarity with [assets](/guides/build/assets/) and [resourc ::: - -

      Prerequisites diff --git a/docs/docs-beta/docs/guides/build/external-resources/defining-resources.md b/docs/docs-beta/docs/guides/build/external-resources/defining-resources.md index 875a1e9687546..16910dca08e66 100644 --- a/docs/docs-beta/docs/guides/build/external-resources/defining-resources.md +++ b/docs/docs-beta/docs/guides/build/external-resources/defining-resources.md @@ -17,31 +17,8 @@ The following example demonstrates defining a subclass of Dict[str, Any]: - return my_conn.request("/fetch_data").json() - -defs = Definitions( - assets=[data_from_service], - resources={ - "my_conn": MyConnectionResource(username="my_user"), - }, -) -``` +{/* TODO add dedent=4 prop when implemented */} + Assets specify resource dependencies by annotating the resource as a parameter to the asset function. @@ -65,34 +42,7 @@ The following example defines a subclass of Response: - return requests.get( - f"https://my-api.com/{endpoint}", - headers={"user-agent": "dagster"}, - ) - -@op -def update_service(my_conn: MyConnectionResource): - my_conn.request("/update") - -@job -def update_service_job(): - update_service() - -defs = Definitions( - jobs=[update_service_job], - resources={ - "my_conn": MyConnectionResource(username="my_user"), - }, -) -``` +{/* TODO add dedent=4 prop when implemented */} + There are many supported config types that can be used when defining resources. Refer to the [advanced config types documentation](/guides/operate/configuration/advanced-config-types) for a more comprehensive overview of the available config types. \ No newline at end of file diff --git a/docs/docs-beta/docs/guides/build/external-resources/managing-resource-state.md b/docs/docs-beta/docs/guides/build/external-resources/managing-resource-state.md index 99145859bcec3..e6dd101c26adc 100644 --- a/docs/docs-beta/docs/guides/build/external-resources/managing-resource-state.md +++ b/docs/docs-beta/docs/guides/build/external-resources/managing-resource-state.md @@ -17,70 +17,10 @@ Once a resource is no longer needed, the `teardown_after_execution` method is ca In the following example, we set up an API token for a client resource based on the username and password provided in the config. The API token can then be used to query an API in the asset body. -```python file=/concepts/resources/pythonic_resources.py startafter=start_with_state_example endbefore=end_with_state_example dedent=4 -from dagster import ConfigurableResource, InitResourceContext, asset -import requests - -from pydantic import PrivateAttr - -class MyClientResource(ConfigurableResource): - username: str - password: str - - _api_token: str = PrivateAttr() - - def setup_for_execution(self, context: InitResourceContext) -> None: - # Fetch and set up an API token based on the username and password - self._api_token = requests.get( - "https://my-api.com/token", auth=(self.username, self.password) - ).text - - def get_all_users(self): - return requests.get( - "https://my-api.com/users", - headers={"Authorization": self._api_token}, - ) - -@asset -def my_asset(client: MyClientResource): - return client.get_all_users() -``` +{/* TODO add dedent=4 prop when implemented */} + For more complex use cases, you can override the `yield_for_execution`. By default, this context manager calls `setup_for_execution`, yields the resource, and then calls `teardown_after_execution`, but you can override it to provide any custom behavior. 
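As a rough sketch of what such an override can look like, the `DBConnection` class and `get_database_connection` helper below are illustrative stand-ins rather than real Dagster APIs:

```python
from contextlib import contextmanager
from typing import Any

from dagster import ConfigurableResource, InitResourceContext, asset
from pydantic import PrivateAttr


class DBConnection:
    def query(self, body: str) -> str:
        return f"ran: {body}"


@contextmanager
def get_database_connection(username: str, password: str):
    # Stand-in for a real helper that opens and later closes a connection.
    conn = DBConnection()
    try:
        yield conn
    finally:
        pass  # close the connection here in a real implementation


class MyClientResource(ConfigurableResource):
    username: str
    password: str

    _db_connection: Any = PrivateAttr()

    @contextmanager
    def yield_for_execution(self, context: InitResourceContext):
        # Keep the connection open for the duration of the execution.
        with get_database_connection(self.username, self.password) as conn:
            self._db_connection = conn
            yield self

    def query(self, body: str) -> str:
        return self._db_connection.query(body)


@asset
def my_asset(client: MyClientResource) -> str:
    return client.query("SELECT * FROM my_table")
```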
This is useful for resources that require a context to be open for the duration of a run, such as database connections or file handles. -```python file=/concepts/resources/pythonic_resources.py startafter=start_with_complex_state_example endbefore=end_with_complex_state_example dedent=4 -from dagster import ConfigurableResource, asset, InitResourceContext -from contextlib import contextmanager -from pydantic import PrivateAttr - -class DBConnection: - ... - - def query(self, body: str): ... - -@contextmanager -def get_database_connection(username: str, password: str): ... - -class MyClientResource(ConfigurableResource): - username: str - password: str - - _db_connection: DBConnection = PrivateAttr() - - @contextmanager - def yield_for_execution(self, context: InitResourceContext): - # keep connection open for the duration of the execution - with get_database_connection(self.username, self.password) as conn: - # set up the connection attribute so it can be used in the execution - self._db_connection = conn - - # yield, allowing execution to occur - yield self - - def query(self, body: str): - return self._db_connection.query(body) - -@asset -def my_asset(client: MyClientResource): - client.query("SELECT * FROM my_table") -``` +{/* TODO add dedent=4 prop when implemented */} + diff --git a/docs/docs-beta/docs/guides/build/external-resources/testing-configurable-resources.md b/docs/docs-beta/docs/guides/build/external-resources/testing-configurable-resources.md index 7c57329d2b123..6fc34df614d2c 100644 --- a/docs/docs-beta/docs/guides/build/external-resources/testing-configurable-resources.md +++ b/docs/docs-beta/docs/guides/build/external-resources/testing-configurable-resources.md @@ -5,67 +5,17 @@ sidebar_position: 700 You can test the initialization of a by constructing it manually. 
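For instance, a test for a hypothetical resource with a single config field might look like the following sketch:

```python
from dagster import ConfigurableResource


class MyResource(ConfigurableResource):
    value: str

    def get_value(self) -> str:
        return self.value


def test_my_resource():
    # No Dagster machinery is required: construct the resource and call its methods.
    assert MyResource(value="foo").get_value() == "foo"
```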
In most cases, the resource can be constructed directly: -```python file=/concepts/resources/pythonic_resources.py startafter=start_new_resource_testing endbefore=end_new_resource_testing dedent=4 -from dagster import ConfigurableResource - -class MyResource(ConfigurableResource): - value: str - - def get_value(self) -> str: - return self.value - -def test_my_resource(): - assert MyResource(value="foo").get_value() == "foo" -``` +{/* TODO add dedent=4 prop when implemented */} + If the resource requires other resources, you can pass them as constructor arguments: -```python file=/concepts/resources/pythonic_resources.py startafter=start_new_resource_testing_with_nesting endbefore=end_new_resource_testing_with_nesting dedent=4 -from dagster import ConfigurableResource - -class StringHolderResource(ConfigurableResource): - value: str - -class MyResourceRequiresAnother(ConfigurableResource): - foo: StringHolderResource - bar: str - -def test_my_resource_with_nesting(): - string_holder = StringHolderResource(value="foo") - resource = MyResourceRequiresAnother(foo=string_holder, bar="bar") - assert resource.foo.value == "foo" - assert resource.bar == "bar" -``` +{/* TODO add dedent=4 prop when implemented */} + ## Testing with resource context In the case that a resource uses the resource initialization context, you can use the utility alongside the `with_init_resource_context` helper on the resource class: -```python file=/concepts/resources/pythonic_resources.py startafter=start_new_resource_testing_with_context endbefore=end_new_resource_testing_with_context dedent=4 -from dagster import ( - ConfigurableResource, - build_init_resource_context, - DagsterInstance, -) -from typing import Optional - -class MyContextResource(ConfigurableResource[GitHub]): - base_path: Optional[str] = None - - def effective_base_path(self) -> str: - if self.base_path: - return self.base_path - instance = self.get_resource_context().instance - assert instance - return instance.storage_directory() - -def test_my_context_resource(): - with DagsterInstance.ephemeral() as instance: - context = build_init_resource_context(instance=instance) - assert ( - MyContextResource(base_path=None) - .with_resource_context(context) - .effective_base_path() - == instance.storage_directory() - ) -``` +{/* TODO add dedent=4 prop when implemented */} + diff --git a/docs/docs-beta/docs/guides/build/external-resources/using-bare-python-objects-as-resources.md b/docs/docs-beta/docs/guides/build/external-resources/using-bare-python-objects-as-resources.md index 80ef845e0b7ef..51262f66e1c94 100644 --- a/docs/docs-beta/docs/guides/build/external-resources/using-bare-python-objects-as-resources.md +++ b/docs/docs-beta/docs/guides/build/external-resources/using-bare-python-objects-as-resources.md @@ -9,20 +9,5 @@ Dagster supports passing plain Python objects as resources. This follows a simil {/* TODO replace `ResourceParam` with */} -```python file=/concepts/resources/pythonic_resources.py startafter=start_raw_github_resource endbefore=end_raw_github_resource dedent=4 -from dagster import Definitions, asset, ResourceParam - -# `ResourceParam[GitHub]` is treated exactly like `GitHub` for type checking purposes, -# and the runtime type of the github parameter is `GitHub`. The purpose of the -# `ResourceParam` wrapper is to let Dagster know that `github` is a resource and not an -# upstream asset. 
- -@asset -def public_github_repos(github: ResourceParam[GitHub]): - return github.organization("dagster-io").repositories() - -defs = Definitions( - assets=[public_github_repos], - resources={"github": GitHub(...)}, -) -``` +{/* TODO add dedent=4 prop when implemented */} + diff --git a/docs/docs-beta/docs/guides/build/io-managers/defining-a-custom-io-manager.md b/docs/docs-beta/docs/guides/build/io-managers/defining-a-custom-io-manager.md index dc6c709a899f8..2eadb7e8fa1a7 100644 --- a/docs/docs-beta/docs/guides/build/io-managers/defining-a-custom-io-manager.md +++ b/docs/docs-beta/docs/guides/build/io-managers/defining-a-custom-io-manager.md @@ -9,24 +9,7 @@ To define an I/O manager, extend the The provided `context` argument for `handle_output` is an . The provided `context` argument for `load_input` is an . The linked API documentation lists all the fields that are available on these objects. @@ -36,64 +19,14 @@ If your I/O manager is more complex, or needs to manage internal state, it may m In this case, we implement a stateful I/O manager which maintains a cache. -{/* TODO convert to */} -```python file=/concepts/io_management/custom_io_manager.py startafter=start_io_manager_factory_marker endbefore=end_io_manager_factory_marker -from dagster import IOManager, ConfigurableIOManagerFactory, OutputContext, InputContext -import requests - - -class ExternalIOManager(IOManager): - def __init__(self, api_token): - self._api_token = api_token - # setup stateful cache - self._cache = {} - - def handle_output(self, context: OutputContext, obj): ... - - def load_input(self, context: InputContext): - if context.asset_key in self._cache: - return self._cache[context.asset_key] - ... - - -class ConfigurableExternalIOManager(ConfigurableIOManagerFactory): - api_token: str - - def create_io_manager(self, context) -> ExternalIOManager: - return ExternalIOManager(self.api_token) -``` + ### Defining Pythonic I/O managers Pythonic I/O managers are defined as subclasses of , and similarly to [Pythonic resources](/guides/build/external-resources/) specify any configuration fields as attributes. Each subclass must implement a `handle_output` and `load_input` method, which are called by Dagster at runtime to handle the storing and loading of data. -{/* TODO convert to */} -```python file=/concepts/resources/pythonic_resources.py startafter=start_new_io_manager endbefore=end_new_io_manager dedent=4 -from dagster import ( - Definitions, - AssetKey, - OutputContext, - InputContext, - ConfigurableIOManager, -) - -class MyIOManager(ConfigurableIOManager): - root_path: str - - def _get_path(self, asset_key: AssetKey) -> str: - return self.root_path + "/".join(asset_key.path) - - def handle_output(self, context: OutputContext, obj): - write_csv(self._get_path(context.asset_key), obj) - - def load_input(self, context: InputContext): - return read_csv(self._get_path(context.asset_key)) - -defs = Definitions( - assets=..., - resources={"io_manager": MyIOManager(root_path="/tmp/")}, -) -``` +{/* TODO add dedent=4 prop to CodeExample below when implemented */} + ### Handling partitioned assets @@ -103,21 +36,7 @@ The default I/O manager has support for loading a partitioned upstream asset for To handle partitions in an custom I/O manager, you'll need to determine which partition you're dealing with when you're storing an output or loading an input. 
For this, and have a `asset_partition_key` property: -{/* TODO convert to */} -```python file=/concepts/io_management/custom_io_manager.py startafter=start_partitioned_marker endbefore=end_partitioned_marker -class MyPartitionedIOManager(IOManager): - def _get_path(self, context) -> str: - if context.has_partition_key: - return "/".join(context.asset_key.path + [context.asset_partition_key]) - else: - return "/".join(context.asset_key.path) - - def handle_output(self, context: OutputContext, obj): - write_csv(self._get_path(context), obj) - - def load_input(self, context: InputContext): - return read_csv(self._get_path(context)) -``` + If you're working with time window partitions, you can also use the `asset_partitions_time_window` property, which will return a object. @@ -127,48 +46,8 @@ A single partition of one asset might depend on a range of partitions of an upst The default I/O manager has support for loading multiple upstream partitions. In this case, the downstream asset should use `Dict[str, ...]` (or leave it blank) type for the upstream `DagsterType`. Here is an example of loading multiple upstream partitions using the default partition mapping: -{/* TODO convert to */} -```python file=/concepts/io_management/loading_multiple_upstream_partitions.py -from datetime import datetime -from typing import Dict - -import pandas as pd - -from dagster import ( - AssetExecutionContext, - DailyPartitionsDefinition, - HourlyPartitionsDefinition, - asset, - materialize, -) - -start = datetime(2022, 1, 1) - -hourly_partitions = HourlyPartitionsDefinition(start_date=f"{start:%Y-%m-%d-%H:%M}") -daily_partitions = DailyPartitionsDefinition(start_date=f"{start:%Y-%m-%d}") - -@asset(partitions_def=hourly_partitions) -def upstream_asset(context: AssetExecutionContext) -> pd.DataFrame: - return pd.DataFrame({"date": [context.partition_key]}) - - -@asset( - partitions_def=daily_partitions, -) -def downstream_asset(upstream_asset: dict[str, pd.DataFrame]) -> pd.DataFrame: - return pd.concat(list(upstream_asset.values())) - - -result = materialize( - [*upstream_asset.to_source_assets(), downstream_asset], - partition_key=start.strftime(daily_partitions.fmt), -) -downstream_asset_data = result.output_for_node("downstream_asset", "result") -assert ( - len(downstream_asset_data) == 24 -), "downstream day should map to upstream 24 hours" -``` + The `upstream_asset` becomes a mapping from partition keys to partition values. This is a property of the default I/O manager or any I/O manager inheriting from the . @@ -182,70 +61,8 @@ In some cases you may find that you need to load an input in a way other than th Since the method for loading an input is directly affected by the way the corresponding output was stored, we recommend defining your input managers as subclasses of existing I/O managers and just updating the `load_input` method. 
In this example, we load an input as a NumPy array rather than a Pandas DataFrame by writing the following: -{/* TODO convert to */} -```python file=/concepts/io_management/input_managers.py startafter=start_plain_input_manager endbefore=end_plain_input_manager -# in this case PandasIOManager is an existing IO Manager -class MyNumpyLoader(PandasIOManager): - def load_input(self, context: InputContext) -> np.ndarray: - file_path = "path/to/dataframe" - array = np.genfromtxt(file_path, delimiter=",", dtype=None) - return array - - -@op(ins={"np_array_input": In(input_manager_key="numpy_manager")}) -def analyze_as_numpy(np_array_input: np.ndarray): - assert isinstance(np_array_input, np.ndarray) - - -@job(resource_defs={"numpy_manager": MyNumpyLoader(), "io_manager": PandasIOManager()}) -def my_job(): - df = produce_pandas_output() - analyze_as_numpy(df) -``` + This may quickly run into issues if the owner of `PandasIOManager` changes the path at which they store outputs. We recommend splitting out path defining logic (or other computations shared by `handle_output` and `load_input`) into new methods that are called when needed. -{/* TODO convert to */} -```python file=/concepts/io_management/input_managers.py startafter=start_better_input_manager endbefore=end_better_input_manager -# this IO Manager is owned by a different team -class BetterPandasIOManager(ConfigurableIOManager): - def _get_path(self, output_context): - return os.path.join( - self.base_dir, - "storage", - f"{output_context.step_key}_{output_context.name}.csv", - ) - - def handle_output(self, context: OutputContext, obj: pd.DataFrame): - file_path = self._get_path(context) - os.makedirs(os.path.dirname(file_path), exist_ok=True) - if obj is not None: - obj.to_csv(file_path, index=False) - - def load_input(self, context: InputContext) -> pd.DataFrame: - return pd.read_csv(self._get_path(context.upstream_output)) - - -# write a subclass that uses _get_path for your custom loading logic -class MyBetterNumpyLoader(BetterPandasIOManager): - def load_input(self, context: InputContext) -> np.ndarray: - file_path = self._get_path(context.upstream_output) - array = np.genfromtxt(file_path, delimiter=",", dtype=None) - return array - - -@op(ins={"np_array_input": In(input_manager_key="better_numpy_manager")}) -def better_analyze_as_numpy(np_array_input: np.ndarray): - assert isinstance(np_array_input, np.ndarray) - - -@job( - resource_defs={ - "numpy_manager": MyBetterNumpyLoader(), - "io_manager": BetterPandasIOManager(), - } -) -def my_better_job(): - df = produce_pandas_output() - better_analyze_as_numpy(df) -``` + diff --git a/docs/docs-beta/docs/guides/build/partitions-and-backfills/backfilling-data.md b/docs/docs-beta/docs/guides/build/partitions-and-backfills/backfilling-data.md index 83397ff17ddc6..a8b01823e91d3 100644 --- a/docs/docs-beta/docs/guides/build/partitions-and-backfills/backfilling-data.md +++ b/docs/docs-beta/docs/guides/build/partitions-and-backfills/backfilling-data.md @@ -47,27 +47,5 @@ To get this behavior, you need to: Which property to use depends on whether it's most convenient for you to operate on start/end datetime objects, start/end partition keys, or a list of partition keys. 
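For instance, a sketch of an asset that handles an entire backfill in a single run by reading the time window from the context might look like this (the asset name and logging are illustrative):

```python
from dagster import (
    AssetExecutionContext,
    BackfillPolicy,
    DailyPartitionsDefinition,
    asset,
)


@asset(
    partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"),
    backfill_policy=BackfillPolicy.single_run(),
)
def events(context: AssetExecutionContext) -> None:
    # A single backfill run may span many partitions, so operate on the full
    # time window rather than a single partition key.
    start_datetime, end_datetime = context.partition_time_window
    context.log.info(f"processing events from {start_datetime} to {end_datetime}")
```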
+ -```python file=/concepts/partitions_schedules_sensors/backfills/single_run_backfill_asset.py startafter=start_marker endbefore=end_marker -from dagster import ( - AssetExecutionContext, - AssetKey, - BackfillPolicy, - DailyPartitionsDefinition, - asset, -) - - -@asset( - partitions_def=DailyPartitionsDefinition(start_date="2020-01-01"), - backfill_policy=BackfillPolicy.single_run(), - deps=[AssetKey("raw_events")], -) -def events(context: AssetExecutionContext) -> None: - start_datetime, end_datetime = context.partition_time_window - - input_data = read_data_in_datetime_range(start_datetime, end_datetime) - output_data = compute_events_from_raw_events(input_data) - - overwrite_data_in_datetime_range(start_datetime, end_datetime, output_data) -``` diff --git a/docs/docs-beta/docs/guides/build/partitions-and-backfills/partitioning-assets.md b/docs/docs-beta/docs/guides/build/partitions-and-backfills/partitioning-assets.md index 5750c184e22b2..286ad6ea2b44b 100644 --- a/docs/docs-beta/docs/guides/build/partitions-and-backfills/partitioning-assets.md +++ b/docs/docs-beta/docs/guides/build/partitions-and-backfills/partitioning-assets.md @@ -54,8 +54,6 @@ Consider this example: -Because the partition values are unknown in advance, `DynamicPartitionsDefinition` is used to define the partition. Then, the `all_regions_sensor` TODO: incomplete sentence - In this example: - Because the partition values are unknown in advance, `DynamicPartitionsDefinition` is used to define `region_partitions` From 8de951f2836e937ed972b1b6f0b48763243c061a Mon Sep 17 00:00:00 2001 From: nikki everett Date: Thu, 6 Feb 2025 00:16:58 -0700 Subject: [PATCH 5/9] update CONTRIBUTING Signed-off-by: nikki everett --- docs/docs-beta/CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs-beta/CONTRIBUTING.md b/docs/docs-beta/CONTRIBUTING.md index a632448df8d06..dc6a84740eb47 100644 --- a/docs/docs-beta/CONTRIBUTING.md +++ b/docs/docs-beta/CONTRIBUTING.md @@ -129,7 +129,7 @@ To include code snippets, use the following format: ``` -You can optionally include [additional properties](https://github.com/dagster-io/dagster/blob/master/docs/docs-beta/src/components/CodeExample.tsx#L4), such as `language`, `title`, `lineStart`, `lineEnd`, `startAfter`, and `endBefore`: +You can optionally include [additional properties](https://github.com/dagster-io/dagster/blob/master/docs/docs-beta/src/components/CodeExample.tsx#L6), such as `language`, `title`, `lineStart`, `lineEnd`, `startAfter`, and `endBefore`: ``` From 0bff5e037978870c5851db247009b2e6a148d169 Mon Sep 17 00:00:00 2001 From: nikki everett Date: Thu, 6 Feb 2025 10:07:46 -0700 Subject: [PATCH 6/9] convert deploy docs to CodeExample Signed-off-by: nikki everett --- .../deploy/dagster-instance-configuration.md | 378 ++---------------- .../guides/deploy/deployment-options/aws.md | 43 +- .../guides/deploy/deployment-options/dask.md | 61 +-- .../guides/deploy/deployment-options/gcp.md | 38 +- .../kubernetes/customizing-your-deployment.md | 143 +------ .../docs/guides/deploy/dev-to-prod.md | 271 +------------ .../docs/guides/deploy/execution/celery.md | 18 +- .../customizing-run-queue-priority.md | 15 +- .../docs/guides/deploy/execution/dask.md | 60 +-- .../guides/deploy/execution/run-monitoring.md | 26 +- .../guides/deploy/execution/run-retries.md | 15 +- ...using-environment-variables-and-secrets.md | 49 +-- 12 files changed, 63 insertions(+), 1054 deletions(-) diff --git a/docs/docs-beta/docs/guides/deploy/dagster-instance-configuration.md 
b/docs/docs-beta/docs/guides/deploy/dagster-instance-configuration.md index efd2392eeff71..fb8450765ff0c 100644 --- a/docs/docs-beta/docs/guides/deploy/dagster-instance-configuration.md +++ b/docs/docs-beta/docs/guides/deploy/dagster-instance-configuration.md @@ -92,21 +92,7 @@ Refer to the following tabs for available options and sample configuration. To use a SQLite database for storage, configure `storage.sqlite` in `dagster.yaml`: -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_storage_sqlite endbefore=end_marker_storage_sqlite -# there are two ways to set storage to SqliteStorage - -# this config manually sets the directory (`base_dir`) for Sqlite to store data in: -storage: - sqlite: - base_dir: /path/to/dir - -# and this config grabs the directory from an environment variable -storage: - sqlite: - base_dir: - env: SQLITE_STORAGE_BASE_DIR -``` + @@ -121,46 +107,8 @@ To use Postgres storage, you'll need to install the [dagster-postgres](/api/pyth To use a [PostgreSQL database](/api/python-api/libraries/dagster-postgres) for storage, configure `storage.postgres` in `dagster.yaml`: -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_storage_postgres endbefore=end_marker_storage_postgres -# Postgres storage can be set using either credentials or a connection string. This requires that -# the `dagster-postgres` library be installed and a database configured with UTC timezone. - -# this config manually sets the Postgres credentials -storage: - postgres: - postgres_db: - username: { DAGSTER_PG_USERNAME } - password: { DAGSTER_PG_PASSWORD } - hostname: { DAGSTER_PG_HOSTNAME } - db_name: { DAGSTER_PG_DB } - port: 5432 - -# and this config grabs the database credentials from environment variables -storage: - postgres: - postgres_db: - username: - env: DAGSTER_PG_USERNAME - password: - env: DAGSTER_PG_PASSWORD - hostname: - env: DAGSTER_PG_HOST - db_name: - env: DAGSTER_PG_DB - port: 5432 - -# and this config sets the credentials via DB connection string / url: -storage: - postgres: - postgres_url: { PG_DB_CONN_STRING } - -# This config gets the DB connection string / url via environment variables: -storage: - postgres: - postgres_url: - env: PG_DB_CONN_STRING -``` + + @@ -175,47 +123,8 @@ To use MySQL storage, you'll need to install the [dagster-mysql](/api/python-api To use a [MySQL database](/api/python-api/libraries/dagster-mysql) for storage, configure `storage.mysql` in `dagster.yaml`: -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_storage_mysql endbefore=end_marker_storage_mysql -# MySQL storage can be set using either credentials or a connection string. This requires that the -# `dagster-mysql` library be installed. 
- -# this config manually sets the MySQL credentials -storage: - mysql: - mysql_db: - username: { DAGSTER_MYSQL_USERNAME } - password: { DAGSTER_MYSQL_PASSWORD } - hostname: { DAGSTER_MYSQL_HOSTNAME } - db_name: { DAGSTER_MYSQL_DB } - port: 3306 - - -# and this config grabs the database credentials from environment variables -storage: - mysql: - mysql_db: - username: - env: DAGSTER_MYSQL_USERNAME - password: - env: DAGSTER_MYSQL_PASSWORD - hostname: - env: DAGSTER_MYSQL_HOSTNAME - db_name: - env: DAGSTER_MYSQL_DB - port: 3306 - -# and this config sets the credentials via DB connection string / url: -storage: - mysql: - mysql_url: { MYSQL_DB_CONN_STRING } - -# this config grabs the MySQL connection string from environment variables -storage: - mysql: - mysql_url: - env: MYSQL_DB_CONN_STRING -``` + + @@ -233,12 +142,8 @@ Refer to the following tabs for available options and sample configuration. Keep The spawns a new process in the same node as a job's code location. -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_run_launcher_default endbefore=end_marker_run_launcher_default -run_launcher: - module: dagster.core.launcher - class: DefaultRunLauncher -``` + + @@ -247,12 +152,8 @@ run_launcher: The allocates a Docker container per run. -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_run_launcher_docker endbefore=end_marker_run_launcher_docker -run_launcher: - module: dagster_docker - class: DockerRunLauncher -``` + + @@ -261,34 +162,7 @@ run_launcher: The allocates a Kubernetes job per run. -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_run_launcher_k8s endbefore=end_marker_run_launcher_k8s -# there are multiple ways to configure the K8sRunLauncher - -# you can set the follow configuration values directly -run_launcher: - module: dagster_k8s.launcher - class: K8sRunLauncher - config: - service_account_name: pipeline_run_service_account - job_image: my_project/dagster_image:latest - instance_config_map: dagster-instance - postgres_password_secret: dagster-postgresql-secret - -# alternatively, you can grab any of these config values from environment variables: -run_launcher: - module: dagster_k8s.launcher - class: K8sRunLauncher - config: - service_account_name: - env: PIPELINE_RUN_SERVICE_ACCOUNT - job_image: - env: DAGSTER_IMAGE_NAME - instance_config_map: - env: DAGSTER_INSTANCE_CONFIG_MAP - postgres_password_secret: - env: DAGSTER_POSTGRES_SECRET -``` + @@ -306,14 +180,8 @@ Refer to the following tabs for available options and sample configuration. The default run coordinator, the immediately sends runs to the [run launcher](#run-launcher). There isn't a notion of `Queued` runs. 
-{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_run_coordinator_default endbefore=end_marker_run_coordinator_default -# Since DefaultRunCoordinator is the default option, omitting the `run_coordinator` key will also suffice, -# but if you would like to set it explicitly: -run_coordinator: - module: dagster.core.run_coordinator - class: DefaultRunCoordinator -``` + + @@ -324,52 +192,8 @@ The */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_run_coordinator_queued endbefore=end_marker_run_coordinator_queued -# There are a few ways to configure the QueuedRunCoordinator: - -# this first option has concurrency limits set to default values -run_coordinator: - module: dagster.core.run_coordinator - class: QueuedRunCoordinator - -# this second option manually specifies limits: -run_coordinator: - module: dagster.core.run_coordinator - class: QueuedRunCoordinator - config: - max_concurrent_runs: 25 - tag_concurrency_limits: - - key: "database" - value: "redshift" - limit: 4 - - key: "dagster/backfill" - limit: 10 - -# as always, some or all of these values can be obtained from environment variables: -run_coordinator: - module: dagster.core.run_coordinator - class: QueuedRunCoordinator - config: - max_concurrent_runs: - env: DAGSTER_OVERALL_CONCURRENCY_LIMIT - tag_concurrency_limits: - - key: "database" - value: "redshift" - limit: - env: DAGSTER_REDSHIFT_CONCURRENCY_LIMIT - - key: "dagster/backfill" - limit: - env: DAGSTER_BACKFILL_CONCURRENCY_LIMIT - -# for higher dequeue throughput, threading can be enabled: -run_coordinator: - module: dagster.core.run_coordinator - class: QueuedRunCoordinator - config: - dequeue_use_threads: true - dequeue_num_workers: 8 -``` + + @@ -387,26 +211,8 @@ Refer to the following tabs for available options and sample configuration. Used by default, the writes `stdout` and `stderr` logs to disk. -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_compute_log_storage_local endbefore=end_marker_compute_log_storage_local -# there are two ways to set the directory that the LocalComputeLogManager writes -# stdout & stderr logs to - -# You could directly set the `base_dir` key -compute_logs: - module: dagster.core.storage.local_compute_log_manager - class: LocalComputeLogManager - config: - base_dir: /path/to/directory - -# Alternatively, you could set the `base_dir` key to an environment variable -compute_logs: - module: dagster.core.storage.local_compute_log_manager - class: LocalComputeLogManager - config: - base_dir: - env: LOCAL_COMPUTE_LOG_MANAGER_DIRECTORY -``` + + @@ -415,12 +221,7 @@ compute_logs: The does not store `stdout` and `stderr` logs for any step. -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_compute_log_storage_noop endbefore=end_marker_compute_log_storage_noop -compute_logs: - module: dagster.core.storage.noop_compute_log_manager - class: NoOpComputeLogManager -``` + @@ -429,42 +230,7 @@ compute_logs: The writes `stdout` and `stderr` to Azure Blob Storage. 
-{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_compute_log_storage_blob endbefore=end_marker_compute_log_storage_blob -# there are multiple ways to configure the AzureBlobComputeLogManager - -# you can set the necessary configuration values directly: -compute_logs: - module: dagster_azure.blob.compute_log_manager - class: AzureBlobComputeLogManager - config: - storage_account: mycorp-dagster - container: compute-logs - secret_credential: - client_id: ... - tenant_id: ... - client_secret: ... - local_dir: /tmp/bar - prefix: dagster-test- - -# alternatively, you can obtain any of these config values from environment variables -compute_logs: - module: dagster_azure.blob.compute_log_manager - class: AzureBlobComputeLogManager - config: - storage_account: - env: MYCORP_DAGSTER_STORAGE_ACCOUNT_NAME - container: - env: CONTAINER_NAME - secret_credential: - client_id: ... - tenant_id: ... - client_secret: ... - local_dir: - env: LOCAL_DIR_PATH - prefix: - env: DAGSTER_COMPUTE_LOG_PREFIX -``` + @@ -473,28 +239,7 @@ compute_logs: The writes `stdout` and `stderr` to Google Cloud Storage. -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_compute_log_storage_gcs endbefore=end_marker_compute_log_storage_gcs -# there are multiple ways to configure the GCSComputeLogManager - -# you can set the necessary configuration values directly: -compute_logs: - module: dagster_gcp.gcs.compute_log_manager - class: GCSComputeLogManager - config: - bucket: mycorp-dagster-compute-logs - prefix: dagster-test- - -# alternatively, you can obtain any of these config values from environment variables -compute_logs: - module: dagster_gcp.gcs.compute_log_manager - class: GCSComputeLogManager - config: - bucket: - env: MYCORP_DAGSTER_COMPUTE_LOGS_BUCKET - prefix: - env: DAGSTER_COMPUTE_LOG_PREFIX -``` + @@ -503,28 +248,7 @@ compute_logs: The writes `stdout` and `stderr` to an Amazon Web Services S3 bucket. -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_compute_log_storage_s3 endbefore=end_marker_compute_log_storage_s3 -# there are multiple ways to configure the S3ComputeLogManager - -# you can set the config values directly: -compute_logs: - module: dagster_aws.s3.compute_log_manager - class: S3ComputeLogManager - config: - bucket: "mycorp-dagster-compute-logs" - prefix: "dagster-test-" - -# or grab some or all of them from environment variables -compute_logs: - module: dagster_aws.s3.compute_log_manager - class: S3ComputeLogManager - config: - bucket: - env: MYCORP_DAGSTER_COMPUTE_LOGS_BUCKET - prefix: - env: DAGSTER_COMPUTE_LOG_PREFIX -``` + @@ -542,36 +266,13 @@ The `local_artifact_storage` key allows you to configure local artifact storage. 
::: -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_local_artifact_storage endbefore=end_marker_local_artifact_storage -# there are two possible ways to configure LocalArtifactStorage - -# example local_artifact_storage setup pointing to /var/shared/dagster directory -local_artifact_storage: - module: dagster.core.storage.root - class: LocalArtifactStorage - config: - base_dir: "/path/to/dir" - -# alternatively, `base_dir` can be set to an environment variable -local_artifact_storage: - module: dagster.core.storage.root - class: LocalArtifactStorage - config: - base_dir: - env: DAGSTER_LOCAL_ARTIFACT_STORAGE_DIR -``` + ### Telemetry The `telemetry` key allows you to opt in or out of Dagster collecting anonymized usage statistics. This is set to `true` by default. -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_telemetry endbefore=end_marker_telemetry -# Allows opting out of Dagster collecting usage statistics. -telemetry: - enabled: false -``` + For more information, see the [Telemetry documentation](/about/telemetry). @@ -583,13 +284,7 @@ When you aren't [running your own gRPC server](/guides/deploy/code-locations/wor If you expect that your code will take longer than 180 seconds to load, set the `code_servers.local_startup_timeout` key. The value should be an integer that indicates the maximum timeout, in seconds. -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_code_servers endbefore=end_marker_code_servers -# Configures how long Dagster waits for code locations -# to load before timing out. -code_servers: - local_startup_timeout: 360 -``` + ### Data retention @@ -597,18 +292,7 @@ The `retention` key allows you to configure how long Dagster retains certain typ By default, Dagster retains skipped sensor ticks for seven days and all other tick types indefinitely. To customize the retention policies for schedule and sensor ticks, use the `purge_after_days` key: -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_retention endbefore=end_marker_retention -# Configures how long Dagster keeps sensor / schedule tick data -retention: - schedule: - purge_after_days: 90 # sets retention policy for schedule ticks of all types - sensor: - purge_after_days: - skipped: 7 - failure: 30 - success: -1 # keep success ticks indefinitely -``` + The `purge_after_days` key accepts either: @@ -619,12 +303,7 @@ The `purge_after_days` key accepts either: The `sensors` key allows you to configure how sensors are evaluated. To evaluate multiple sensors in parallel simultaneously, set the `use_threads` and `num_workers` keys: -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_sensors endbefore=end_marker_sensors -sensors: - use_threads: true - num_workers: 8 -``` + You can also set the optional `num_submit_workers` key to evaluate multiple run requests from the same sensor tick in parallel, which can help decrease latency when a single sensor tick returns many run requests. @@ -634,11 +313,6 @@ The `schedules` key allows you to configure how schedules are evaluated. 
By defa To evaluate multiple schedules in parallel simultaneously, set the `use_threads` and `num_workers` keys: -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_marker_schedules endbefore=end_marker_schedules -schedules: - use_threads: true - num_workers: 8 -``` + You can also set the optional `num_submit_workers` key to evaluate multiple run requests from the same schedule tick in parallel, which can help decrease latency when a single schedule tick returns many run requests. diff --git a/docs/docs-beta/docs/guides/deploy/deployment-options/aws.md b/docs/docs-beta/docs/guides/deploy/deployment-options/aws.md index 622ff4242c342..885e9df1e5c18 100644 --- a/docs/docs-beta/docs/guides/deploy/deployment-options/aws.md +++ b/docs/docs-beta/docs/guides/deploy/deployment-options/aws.md @@ -14,16 +14,7 @@ To host Dagster on a bare VM or in Docker on EC2, see "[Running Dagster as a ser You can use a hosted RDS PostgreSQL database for your Dagster run/events data by configuring your `dagster.yaml` file: -```python file=/deploying/dagster-pg.yaml -storage: - postgres: - postgres_db: - username: my_username - password: my_password - hostname: my_hostname - db_name: my_database - port: 5432 -``` + In this case, you'll want to ensure that: @@ -212,39 +203,11 @@ To enable parallel computation (e.g., with the multiprocessing or Dagster celery You'll need to use as your I/O Manager or customize your own persistent I/O managers. Refer to the [I/O managers documentation](/guides/build/io-managers/) for an example. -{/* TODO convert to */} -```python file=/deploying/aws/io_manager.py -from dagster_aws.s3.io_manager import s3_pickle_io_manager -from dagster_aws.s3.resources import s3_resource - -from dagster import Int, Out, job, op - - -@op(out=Out(Int)) -def my_op(): - return 1 - - -@job( - resource_defs={ - "io_manager": s3_pickle_io_manager, - "s3": s3_resource, - } -) -def my_job(): - my_op() -``` + Then, add the following YAML block in your job's config: -{/* TODO convert to */} -```yaml file=/deploying/aws/io_manager.yaml -resources: - io_manager: - config: - s3_bucket: my-cool-bucket - s3_prefix: good/prefix-for-files- -``` + The resource uses `boto` under the hood. If you're accessing your private buckets, you'll need to provide the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables or follow [one of the other boto authentication methods](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#configuring-credentials). diff --git a/docs/docs-beta/docs/guides/deploy/deployment-options/dask.md b/docs/docs-beta/docs/guides/deploy/deployment-options/dask.md index 799a8f3983493..9f31f3d297a30 100644 --- a/docs/docs-beta/docs/guides/deploy/deployment-options/dask.md +++ b/docs/docs-beta/docs/guides/deploy/deployment-options/dask.md @@ -23,32 +23,11 @@ First, run `pip install dagster-dask`. Then, create a job with the dask executor: -{/* TODO convert to */} -```python file=/deploying/dask_hello_world.py startafter=start_local_job_marker endbefore=end_local_job_marker -from dagster_dask import dask_executor - -from dagster import job, op - - -@op -def hello_world(): - return "Hello, World!" 
- - -@job(executor_def=dask_executor) -def local_dask_job(): - hello_world() -``` + Now you can run this job with a config block such as the following: -{/* TODO convert to */} -```python file=/deploying/dask_hello_world.yaml -execution: - config: - cluster: - local: -``` + Executing this job will spin up local Dask execution, run the job, and exit. @@ -58,43 +37,11 @@ If you want to use a Dask cluster for distributed execution, you will first need You'll also need an IO manager that uses persistent shared storage, which should be attached to the job along with any resources on which it depends. Here, we use the : -{/* TODO convert to */} - -```python file=/deploying/dask_hello_world_distributed.py startafter=start_distributed_job_marker endbefore=end_distributed_job_marker -from dagster_aws.s3.io_manager import s3_pickle_io_manager -from dagster_aws.s3.resources import s3_resource -from dagster_dask import dask_executor - -from dagster import job, op - - -@op -def hello_world(): - return "Hello, World!" - - -@job( - executor_def=dask_executor, - resource_defs={"io_manager": s3_pickle_io_manager, "s3": s3_resource}, -) -def distributed_dask_job(): - hello_world() -``` + For distributing task execution on a Dask cluster, you must provide a config block that includes the address/port of the Dask scheduler: -{/* TODO convert to */} -```python file=/deploying/dask_remote.yaml -resources: - io_manager: - config: - s3_bucket: your_bucket_name -execution: - config: - cluster: - existing: - address: "dask_scheduler.dns_name:8787" -``` + Since Dask will invoke your job code on the cluster workers, you must ensure that the latest version of your Python code is available to all of the Dask workers. Ideally, you'll package this as a Python module, and target your `workspace.yaml` at this module. diff --git a/docs/docs-beta/docs/guides/deploy/deployment-options/gcp.md b/docs/docs-beta/docs/guides/deploy/deployment-options/gcp.md index c9e4c3f1d56f5..53b8c9ebdf121 100644 --- a/docs/docs-beta/docs/guides/deploy/deployment-options/gcp.md +++ b/docs/docs-beta/docs/guides/deploy/deployment-options/gcp.md @@ -14,16 +14,7 @@ To host the Dagster webserver or Dagster daemon on a bare VM or in Docker on GCE We recommend launching a Cloud SQL PostgreSQL instance for run and events data. You can configure the webserver to use Cloud SQL to run and events data by setting blocks in your `$DAGSTER_HOME/dagster.yaml` appropriately: -```python file=/deploying/dagster-pg.yaml -storage: - postgres: - postgres_db: - username: my_username - password: my_password - hostname: my_hostname - db_name: my_database - port: 5432 -``` + In this case, you'll want to ensure you provide the right connection strings for your Cloud SQL instance, and that the node or container hosting the webserver is able to connect to Cloud SQL. 
@@ -37,31 +28,6 @@ You'll probably also want to configure a GCS bucket to store op outputs via pers You'll first need to need to create a job using as its IO Manager (or [define a custom IO Manager](/guides/build/io-managers/defining-a-custom-io-manager)): -{/* TODO convert to */} -```python file=/deploying/gcp/gcp_job.py -from dagster_gcp.gcs.io_manager import gcs_pickle_io_manager -from dagster_gcp.gcs.resources import gcs_resource - -from dagster import job - - -@job( - resource_defs={ - "gcs": gcs_resource, - "io_manager": gcs_pickle_io_manager, - }, - config={ - "resources": { - "io_manager": { - "config": { - "gcs_bucket": "my-cool-bucket", - "gcs_prefix": "good/prefix-for-files-", - } - } - } - }, -) -def gcs_job(): ... -``` + With this in place, your job runs will store outputs on GCS in the location `gs:///dagster/storage//files/.compute`. diff --git a/docs/docs-beta/docs/guides/deploy/deployment-options/kubernetes/customizing-your-deployment.md b/docs/docs-beta/docs/guides/deploy/deployment-options/kubernetes/customizing-your-deployment.md index e4fc3a85873d1..c48278e58e5c8 100644 --- a/docs/docs-beta/docs/guides/deploy/deployment-options/kubernetes/customizing-your-deployment.md +++ b/docs/docs-beta/docs/guides/deploy/deployment-options/kubernetes/customizing-your-deployment.md @@ -26,30 +26,7 @@ Refer to the [Kubernetes documentation](https://kubernetes.io/docs/home/) for mo The value for each of these keys is a dictionary with the YAML configuration for the underlying Kubernetes object. The Kubernetes object fields can be configured using either snake case (for example, `volume_mounts`) or camel case (`volumeMounts`). For example: -{/* TODO convert to */} -```yaml file=/deploying/kubernetes/run_k8s_config.yaml -runLauncher: - type: K8sRunLauncher - config: - k8sRunLauncher: - runK8sConfig: - containerConfig: # raw config for the pod's main container - resources: - limits: - cpu: 100m - memory: 128Mi - podTemplateSpecMetadata: # raw config for the pod's metadata - annotations: - mykey: myvalue - podSpecConfig: # raw config for the spec of the launched's pod - nodeSelector: - disktype: ssd - jobSpecConfig: # raw config for the kubernetes job's spec - ttlSecondsAfterFinished: 7200 - jobMetadata: # raw config for the kubernetes job's metadata - annotations: - mykey: myvalue -``` + If your Dagster job is configured with the that runs each step in its own pod, configuration that you set in `runK8sConfig` will also be propagated to the pods that are created for each step, unless that step's configuration is overridden using one of the methods below. @@ -69,49 +46,7 @@ Refer to the [Kubernetes documentation](https://kubernetes.io/docs/home/) for mo The value for each of these keys is a dictionary with the YAML configuration for the underlying Kubernetes object. The Kubernetes object fields can be configured using either snake case (for example, `volume_mounts`) or camel case (`volumeMounts`). 
For example: -{/* TODO convert to */} -```python file=/deploying/kubernetes/k8s_config_tag_job.py startafter=start_k8s_config endbefore=end_k8s_config -@job( - tags={ - "dagster-k8s/config": { - "container_config": { - "resources": { - "requests": {"cpu": "250m", "memory": "64Mi"}, - "limits": {"cpu": "500m", "memory": "2560Mi"}, - }, - "volume_mounts": [ - {"name": "volume1", "mount_path": "foo/bar", "sub_path": "file.txt"} - ], - }, - "pod_template_spec_metadata": { - "annotations": {"cluster-autoscaler.kubernetes.io/safe-to-evict": "true"} - }, - "pod_spec_config": { - "volumes": [{"name": "volume1", "secret": {"secret_name": "volume_secret_name"}}], - "affinity": { - "node_affinity": { - "required_during_scheduling_ignored_during_execution": { - "node_selector_terms": [ - { - "match_expressions": [ - { - "key": "beta.kubernetes.io/os", - "operator": "In", - "values": ["windows", "linux"], - } - ] - } - ] - } - } - }, - }, - }, - }, -) -def my_job(): - my_op() -``` + Other run launchers will ignore the `dagster-k8s/config` tag. @@ -135,24 +70,7 @@ Refer to the [Kubernetes documentation](https://kubernetes.io/docs/home/) for mo The value for each of these keys is a dictionary with the YAML configuration for the underlying Kubernetes object. The Kubernetes object fields can be configured using either snake case (for example, `volume_mounts`) or camel case (`volumeMounts`). For example: -{/* TODO convert to */} -```python file=/deploying/kubernetes/step_k8s_config.py startafter=start_step_k8s_config endbefore=end_step_k8s_config -my_k8s_executor = k8s_job_executor.configured( - { - "step_k8s_config": { - "container_config": { - "resources": { - "requests": {"cpu": "200m", "memory": "32Mi"}, - } - } - } - } -) - -@job(executor_def=my_k8s_executor) -def my_job(): - ... -``` + ### Kubernetes configuration on individual steps in a run @@ -172,47 +90,11 @@ The value for each of these keys is a dictionary with the YAML configuration for For example, for an asset: -{/* TODO convert to */} -```python file=/deploying/kubernetes/k8s_config_tag_asset.py startafter=start_k8s_config endbefore=end_k8s_config -@asset( - op_tags={ - "dagster-k8s/config": { - "container_config": { - "resources": { - "requests": {"cpu": "200m", "memory": "32Mi"}, - } - }, - } - } -) -def my_asset(context: AssetExecutionContext): - context.log.info("running") - -my_job = define_asset_job(name="my_job", selection="my_asset", executor_def=k8s_job_executor) -``` + or an op: -{/* TODO convert to */} -```python file=/deploying/kubernetes/k8s_config_tag_op.py startafter=start_k8s_config endbefore=end_k8s_config -@op( - tags={ - "dagster-k8s/config": { - "container_config": { - "resources": { - "requests": {"cpu": "200m", "memory": "32Mi"}, - } - }, - } - } -) -def my_op(context: OpExecutionContext): - context.log.info("running") - -@job(executor_def=k8s_job_executor) -def my_job(): - my_op() -``` + Other executors will ignore the `dagster-k8s/config` tag when it is set on an op or asset. @@ -379,20 +261,7 @@ helm upgrade --install user-code dagster/dagster-user-deployments -f /path/to/va If you use a Kubernetes distribution that supports the [TTL Controller](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/#ttl-controller), then `Completed` and `Failed` [Jobs](https://kubernetes.io/docs/concepts/workloads/controllers/job/) (and their associated [Pods](https://kubernetes.io/docs/concepts/workloads/pods/)) will be deleted after 1 day. 
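One possible sketch, setting a two-hour TTL with illustrative op and job names, is:

```python
from dagster import job, op


@op
def my_op() -> None:
    pass


@job(
    tags={
        "dagster-k8s/config": {
            "job_spec_config": {
                # Kubernetes deletes the finished Job (and its pods) after 2 hours.
                "ttl_seconds_after_finished": 7200,
            }
        }
    }
)
def my_job():
    my_op()
```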
The TTL value can be modified in your job tags: -{/* TODO convert to */} -```python file=/deploying/kubernetes/ttl_config_job.py startafter=start_ttl endbefore=end_ttl -@job( - tags = { - 'dagster-k8s/config': { - 'job_spec_config': { - 'ttl_seconds_after_finished': 7200 - } - } - } -) -def my_job(): - my_op() -``` + If you do not use a Kubernetes distribution that supports the [TTL Controller](https://kubernetes.io/docs/concepts/workloads/controllers/ttlafterfinished/#ttl-controller), then you can run the following commands: diff --git a/docs/docs-beta/docs/guides/deploy/dev-to-prod.md b/docs/docs-beta/docs/guides/deploy/dev-to-prod.md index 77ccedb204504..0c7fb4b28c7b7 100644 --- a/docs/docs-beta/docs/guides/deploy/dev-to-prod.md +++ b/docs/docs-beta/docs/guides/deploy/dev-to-prod.md @@ -29,7 +29,7 @@ Using these Dagster concepts we will: :::tip -You can find the code for this example on [GitHub](https://github.com/dagster-io/dagster/tree/1.9.8/examples/development_to_production/). +You can find the code for this example on [GitHub](https://github.com/dagster-io/dagster/tree/master/examples/development_to_production/). ::: @@ -51,79 +51,11 @@ In this section we will: Let’s start by writing our three assets. We'll use Pandas DataFrames to interact with the data. -{/* TODO convert to */} -```python file=/guides/dagster/development_to_production/assets.py startafter=start_assets endbefore=end_assets -# assets.py -import pandas as pd -import requests - -from dagster import Config, asset - - -class ItemsConfig(Config): - base_item_id: int - - -@asset( - io_manager_key="snowflake_io_manager", -) -def items(config: ItemsConfig) -> pd.DataFrame: - """Items from the Hacker News API: each is a story or a comment on a story.""" - rows = [] - max_id = requests.get( - "https://hacker-news.firebaseio.com/v0/maxitem.json", timeout=5 - ).json() - # Hacker News API is 1-indexed, so adjust range by 1 - for item_id in range(max_id - config.base_item_id + 1, max_id + 1): - item_url = f"https://hacker-news.firebaseio.com/v0/item/{item_id}.json" - rows.append(requests.get(item_url, timeout=5).json()) - - # ITEM_FIELD_NAMES is a list of the column names in the Hacker News dataset - result = pd.DataFrame(rows, columns=ITEM_FIELD_NAMES).drop_duplicates(subset=["id"]) - result.rename(columns={"by": "user_id"}, inplace=True) - return result - - -@asset( - io_manager_key="snowflake_io_manager", -) -def comments(items: pd.DataFrame) -> pd.DataFrame: - """Comments from the Hacker News API.""" - return items[items["type"] == "comment"] - - -@asset( - io_manager_key="snowflake_io_manager", -) -def stories(items: pd.DataFrame) -> pd.DataFrame: - """Stories from the Hacker News API.""" - return items[items["type"] == "story"] -``` + Now we can add these assets to our object and materialize them via the UI as part of our local development workflow. We can pass in credentials to our `SnowflakePandasIOManager`. -{/* TODO convert to */} -```python file=/guides/dagster/development_to_production/repository/repository_v1.py startafter=start endbefore=end -# definitions.py -from dagster_snowflake_pandas import SnowflakePandasIOManager - -from dagster import Definitions -from development_to_production.assets.hacker_news_assets import comments, items, stories - -# Note that storing passwords in configuration is bad practice. It will be resolved later in the guide. 
-resources = { - "snowflake_io_manager": SnowflakePandasIOManager( - account="abc1234.us-east-1", - user="me@company.com", - # password in config is bad practice - password="my_super_secret_password", - database="LOCAL", - schema="ALICE", - ), -} - -defs = Definitions(assets=[items, comments, stories], resources=resources) -``` + Note that we have passwords in our configuration in this code snippet. This is bad practice, and we will resolve it shortly. @@ -152,39 +84,7 @@ We want to store the assets in a production Snowflake database, so we need to up Instead, we can determine the configuration for resources based on the environment: -{/* TODO convert to */} -```python file=/guides/dagster/development_to_production/repository/repository_v2.py startafter=start endbefore=end -# definitions.py - -# Note that storing passwords in configuration is bad practice. It will be resolved soon. -resources = { - "local": { - "snowflake_io_manager": SnowflakePandasIOManager( - account="abc1234.us-east-1", - user="me@company.com", - # password in config is bad practice - password="my_super_secret_password", - database="LOCAL", - schema="ALICE", - ), - }, - "production": { - "snowflake_io_manager": SnowflakePandasIOManager( - account="abc1234.us-east-1", - user="dev@company.com", - # password in config is bad practice - password="company_super_secret_password", - database="PRODUCTION", - schema="HACKER_NEWS", - ), - }, -} -deployment_name = os.getenv("DAGSTER_DEPLOYMENT", "local") - -defs = Definitions( - assets=[items, comments, stories], resources=resources[deployment_name] -) -``` + Note that we still have passwords in our configuration in this code snippet. This is bad practice, and we will resolve it next. @@ -197,37 +97,7 @@ We still have some problems with this setup: We can easily solve these problems using , which lets us source configuration for resources from environment variables. This allows us to store Snowflake configuration values as environment variables and point the I/O manager to those environment variables: -{/* TODO convert to */} -```python file=/guides/dagster/development_to_production/repository/repository_v3.py startafter=start endbefore=end -# definitions.py - - -resources = { - "local": { - "snowflake_io_manager": SnowflakePandasIOManager( - account="abc1234.us-east-1", - user=EnvVar("DEV_SNOWFLAKE_USER"), - password=EnvVar("DEV_SNOWFLAKE_PASSWORD"), - database="LOCAL", - schema=EnvVar("DEV_SNOWFLAKE_SCHEMA"), - ), - }, - "production": { - "snowflake_io_manager": SnowflakePandasIOManager( - account="abc1234.us-east-1", - user="system@company.com", - password=EnvVar("SYSTEM_SNOWFLAKE_PASSWORD"), - database="PRODUCTION", - schema="HACKER_NEWS", - ), - }, -} -deployment_name = os.getenv("DAGSTER_DEPLOYMENT", "local") - -defs = Definitions( - assets=[items, comments, stories], resources=resources[deployment_name] -) -``` + ### Staging @@ -237,22 +107,7 @@ Depending on your organization’s Dagster setup, there are a couple of options - **For a self-hosted staging deployment**, we’ve already done most of the necessary work to run our assets in staging! All we need to do is add another entry to the `resources` dictionary and set `DAGSTER_DEPLOYMENT=staging` in our staging deployment. 
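As a sketch, the new entry might point at a separate `STAGING` database while reusing the same credentials pattern as production (the values below are illustrative):

```python
from dagster import EnvVar
from dagster_snowflake_pandas import SnowflakePandasIOManager

# Added alongside the existing "local" and "production" entries in `resources`.
staging_resources = {
    "snowflake_io_manager": SnowflakePandasIOManager(
        account="abc1234.us-east-1",
        user="system@company.com",
        password=EnvVar("SYSTEM_SNOWFLAKE_PASSWORD"),
        database="STAGING",
        schema="HACKER_NEWS",
    ),
}
```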
-{/* TODO convert to */} -```python file=/guides/dagster/development_to_production/repository/repository_v3.py startafter=start_staging endbefore=end_staging -resources = { - "local": {...}, - "production": {...}, - "staging": { - "snowflake_io_manager": SnowflakePandasIOManager( - account="abc1234.us-east-1", - user="system@company.com", - password=EnvVar("SYSTEM_SNOWFLAKE_PASSWORD"), - database="STAGING", - schema="HACKER_NEWS", - ), - }, -} -``` + ## Advanced: Unit tests with stubs and mocks @@ -275,64 +130,11 @@ Determining when it makes sense to stub a resource for a unit test can be a topi We'll start by writing the "real" Hacker News API Client: -{/* TODO convert to */} -```python file=/guides/dagster/development_to_production/resources/resources_v1.py startafter=start_resource endbefore=end_resource -# resources.py -from typing import Any, Dict, Optional - -import requests - -from dagster import ConfigurableResource - - -class HNAPIClient(ConfigurableResource): - """Hacker News client that fetches live data.""" - - def fetch_item_by_id(self, item_id: int) -> Optional[dict[str, Any]]: - """Fetches a single item from the Hacker News API by item id.""" - item_url = f"https://hacker-news.firebaseio.com/v0/item/{item_id}.json" - item = requests.get(item_url, timeout=5).json() - return item - - def fetch_max_item_id(self) -> int: - return requests.get( - "https://hacker-news.firebaseio.com/v0/maxitem.json", timeout=5 - ).json() - - @property - def item_field_names(self) -> list: - # omitted for brevity, see full code example for implementation - return [] -``` + We'll also need to update the `items` asset to use this client as a resource: -{/* TODO convert to */} -```python file=/guides/dagster/development_to_production/assets_v2.py startafter=start_items endbefore=end_items -# assets.py - - -class ItemsConfig(Config): - base_item_id: int - - -@asset( - io_manager_key="snowflake_io_manager", -) -def items(config: ItemsConfig, hn_client: HNAPIClient) -> pd.DataFrame: - """Items from the Hacker News API: each is a story or a comment on a story.""" - max_id = hn_client.fetch_max_item_id() - rows = [] - # Hacker News API is 1-indexed, so adjust range by 1 - for item_id in range(max_id - config.base_item_id + 1, max_id + 1): - rows.append(hn_client.fetch_item_by_id(item_id)) - - result = pd.DataFrame(rows, columns=hn_client.item_field_names).drop_duplicates( - subset=["id"] - ) - result.rename(columns={"by": "user_id"}, inplace=True) - return result -``` + :::note @@ -342,45 +144,11 @@ For the sake of brevity, we've omitted the implementation of the property `item_ We'll also need to add an instance of `HNAPIClient` to `resources` in our `Definitions` object. -```python file=/guides/dagster/development_to_production/repository/repository_v3.py startafter=start_hn_resource endbefore=end_hn_resource -resource_defs = { - "local": {"hn_client": HNAPIClient(), "snowflake_io_manager": {...}}, - "production": {"hn_client": HNAPIClient(), "snowflake_io_manager": {...}}, - "staging": {"hn_client": HNAPIClient(), "snowflake_io_manager": {...}}, -} -``` + Now we can write a stubbed version of the Hacker News resource. We want to make sure the stub has implementations for each method `HNAPIClient` implements. 
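Before looking at the stub itself (shown in the next snippet), here's a hedged sketch of a small test that guards against the stub and the real client drifting apart. The test file name and import path are assumptions; adjust them to your project layout.

```python
# test_resources.py -- illustrative sketch, not part of the original example.
# Assumes HNAPIClient and StubHNClient are importable from the project's resources module.
from development_to_production.resources import HNAPIClient, StubHNClient


def test_stub_covers_client_interface():
    # every public method/property the real client exposes should also exist on the stub
    for name in ("fetch_item_by_id", "fetch_max_item_id", "item_field_names"):
        assert hasattr(HNAPIClient(), name)
        assert hasattr(StubHNClient(), name)
```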
-{/* TODO convert to */} -```python file=/guides/dagster/development_to_production/resources/resources_v2.py startafter=start_mock endbefore=end_mock -# resources.py - - -class StubHNClient: - """Hacker News Client that returns fake data.""" - - def __init__(self): - self.data = { - 1: { - "id": 1, - "type": "comment", - "title": "the first comment", - "by": "user1", - }, - 2: {"id": 2, "type": "story", "title": "an awesome story", "by": "user2"}, - } - - def fetch_item_by_id(self, item_id: int) -> Optional[dict[str, Any]]: - return self.data.get(item_id) - - def fetch_max_item_id(self) -> int: - return 2 - - @property - def item_field_names(self) -> list: - return ["id", "type", "title", "by"] -``` + :::note @@ -390,24 +158,7 @@ Since the stub Hacker News resource and the real Hacker News resource need to im Now we can use the stub Hacker News resource to test that the `items` asset transforms the data in the way we expect: -{/* TODO convert to */} -```python file=/guides/dagster/development_to_production/test_assets.py startafter=start endbefore=end -# test_assets.py - - -def test_items(): - hn_dataset = items( - config=ItemsConfig(base_item_id=StubHNClient().fetch_max_item_id()), - hn_client=StubHNClient(), - ) - assert isinstance(hn_dataset, pd.DataFrame) - - expected_data = pd.DataFrame(StubHNClient().data.values()).rename( - columns={"by": "user_id"} - ) - - assert (hn_dataset == expected_data).all().all() -``` + :::note diff --git a/docs/docs-beta/docs/guides/deploy/execution/celery.md b/docs/docs-beta/docs/guides/deploy/execution/celery.md index b442048acc3a0..b59c723f0017a 100644 --- a/docs/docs-beta/docs/guides/deploy/execution/celery.md +++ b/docs/docs-beta/docs/guides/deploy/execution/celery.md @@ -29,23 +29,7 @@ To demonstrate, we'll start by constructing a parallel toy job that uses the Cel In your Dagster project, create a new file named `celery_job.py` and paste in the following: -{/* TODO convert to */} -```python file=/deploying/celery_job.py -from dagster_celery import celery_executor - -from dagster import job, op - - -@op -def not_much(): - return - - -@job(executor_def=celery_executor) -def parallel_job(): - for i in range(50): - not_much.alias("not_much_" + str(i))() -``` + Now, run the Celery executor. In our case, we're running RabbitMQ as our broker. With Docker, this is something like the following: diff --git a/docs/docs-beta/docs/guides/deploy/execution/customizing-run-queue-priority.md b/docs/docs-beta/docs/guides/deploy/execution/customizing-run-queue-priority.md index c3f6f50eef759..3b44a6520be4d 100644 --- a/docs/docs-beta/docs/guides/deploy/execution/customizing-run-queue-priority.md +++ b/docs/docs-beta/docs/guides/deploy/execution/customizing-run-queue-priority.md @@ -43,20 +43,7 @@ When defining a priority value, note that: In this example, the priority is set to `-1` with a `dagster/priority` tag value of `"-1"`: -{/* TODO convert to */} -```python startafter=start_marker_priority endbefore=end_marker_priority file=/deploying/concurrency_limits/concurrency_limits.py -@job(tags={"dagster/priority": "3"}) -def important_job(): ... - - -@schedule( - cron_schedule="* * * * *", - job_name="important_job", - execution_timezone="US/Central", - tags={"dagster/priority": "-1"}, -) -def less_important_schedule(_): ... 
-``` + diff --git a/docs/docs-beta/docs/guides/deploy/execution/dask.md b/docs/docs-beta/docs/guides/deploy/execution/dask.md index a344a223ef83e..ade32fd1302dc 100644 --- a/docs/docs-beta/docs/guides/deploy/execution/dask.md +++ b/docs/docs-beta/docs/guides/deploy/execution/dask.md @@ -23,32 +23,11 @@ First, run `pip install dagster-dask`. Then, create a job with the dask executor: -{/* TODO convert to */} -```python file=/deploying/dask_hello_world.py startafter=start_local_job_marker endbefore=end_local_job_marker -from dagster_dask import dask_executor - -from dagster import job, op - - -@op -def hello_world(): - return "Hello, World!" - - -@job(executor_def=dask_executor) -def local_dask_job(): - hello_world() -``` + Now you can run this job with a config block such as the following: -{/* TODO convert to */} -```python file=/deploying/dask_hello_world.yaml -execution: - config: - cluster: - local: -``` + Executing this job will spin up local Dask execution, run the job, and exit. @@ -58,42 +37,11 @@ If you want to use a Dask cluster for distributed execution, you will first need You'll also need an IO manager that uses persistent shared storage, which should be attached to the job along with any resources on which it depends. Here, we use the : -{/* TODO convert to */} -```python file=/deploying/dask_hello_world_distributed.py startafter=start_distributed_job_marker endbefore=end_distributed_job_marker -from dagster_aws.s3.io_manager import s3_pickle_io_manager -from dagster_aws.s3.resources import s3_resource -from dagster_dask import dask_executor - -from dagster import job, op - - -@op -def hello_world(): - return "Hello, World!" - - -@job( - executor_def=dask_executor, - resource_defs={"io_manager": s3_pickle_io_manager, "s3": s3_resource}, -) -def distributed_dask_job(): - hello_world() -``` + For distributing task execution on a Dask cluster, you must provide a config block that includes the address/port of the Dask scheduler: -{/* TODO convert to */} -```python file=/deploying/dask_remote.yaml -resources: - io_manager: - config: - s3_bucket: your_bucket_name -execution: - config: - cluster: - existing: - address: "dask_scheduler.dns_name:8787" -``` + Since Dask will invoke your job code on the cluster workers, you must ensure that the latest version of your Python code is available to all of the Dask workers. Ideally, you'll package this as a Python module, and target your `workspace.yaml` at this module. 
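To make that last point concrete, a minimal `workspace.yaml` that loads your code from an installed Python module might look like the following sketch; the module name is a placeholder for your own package:

```yaml
# workspace.yaml -- illustrative sketch; replace the module name with your own package
load_from:
  - python_module: my_dask_project
```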
diff --git a/docs/docs-beta/docs/guides/deploy/execution/run-monitoring.md b/docs/docs-beta/docs/guides/deploy/execution/run-monitoring.md index a43164a9c962f..9fac70a4a404c 100644 --- a/docs/docs-beta/docs/guides/deploy/execution/run-monitoring.md +++ b/docs/docs-beta/docs/guides/deploy/execution/run-monitoring.md @@ -8,17 +8,7 @@ Dagster can detect hanging runs and restart crashed [run workers](/guides/deploy - Running the Dagster Daemon - Enabling run monitoring in the Dagster Instance: -{/* TODO convert to */} -```yaml file=/deploying/dagster_instance/dagster.yaml startafter=start_run_monitoring endbefore=end_run_monitoring -# Opt in to run monitoring -run_monitoring: - enabled: true - # values below are the defaults, and don't need to be specified except to override them - start_timeout_seconds: 180 - cancel_timeout_seconds: 180 - max_resume_run_attempts: 3 # experimental if above 0 - poll_interval_seconds: 120 -``` + :::note @@ -55,19 +45,7 @@ run_monitoring: The below code example shows how to set a run timeout of 10 seconds on a per-job basis: -```python file=/deploying/monitoring_daemon/run_timeouts.py startafter=start_timeout -from dagster import define_asset_job, job - - -@job(tags={"dagster/max_runtime": 10}) -def my_job(): ... - - -asset_job = define_asset_job( - name="some_job", selection="*", tags={"dagster/max_runtime": 10} -) -# end_timeout -``` + ## Detecting run worker crashes diff --git a/docs/docs-beta/docs/guides/deploy/execution/run-retries.md b/docs/docs-beta/docs/guides/deploy/execution/run-retries.md index 05a9c7ea795a2..6016a92584d3d 100644 --- a/docs/docs-beta/docs/guides/deploy/execution/run-retries.md +++ b/docs/docs-beta/docs/guides/deploy/execution/run-retries.md @@ -22,20 +22,7 @@ run_retries: In both Dagster+ and Dagster Open Source, you can also configure retries using tags either on Job definitions or in the Dagster UI [Launchpad](/guides/operate/webserver). -{/* TODO convert to */} -```python file=/deploying/job_retries.py -from dagster import job - - -@job(tags={"dagster/max_retries": 3}) -def sample_job(): - pass - - -@job(tags={"dagster/max_retries": 3, "dagster/retry_strategy": "ALL_STEPS"}) -def other_sample_sample_job(): - pass -``` + ### Retry Strategy diff --git a/docs/docs-beta/docs/guides/deploy/using-environment-variables-and-secrets.md b/docs/docs-beta/docs/guides/deploy/using-environment-variables-and-secrets.md index 12321c96aeba1..d1f7dd33662ef 100644 --- a/docs/docs-beta/docs/guides/deploy/using-environment-variables-and-secrets.md +++ b/docs/docs-beta/docs/guides/deploy/using-environment-variables-and-secrets.md @@ -193,22 +193,7 @@ Let's review what's happening here: As storing secrets in configuration is bad practice, we'll opt for using an environment variable. 
In this code, we're configuring the resource supplying it to our assets: -{/* TODO convert to */} -```python file=/guides/dagster/using_environment_variables_and_secrets/repository.py startafter=start endbefore=end -# definitions.py - -from my_dagster_project import assets -from my_dagster_project.resources import GithubClientResource - -from dagster import Definitions, EnvVar, load_assets_from_package_module - -defs = Definitions( - assets=load_assets_from_package_module(assets), - resources={ - "github_api": GithubClientResource(access_token=EnvVar("GITHUB_ACCESS_TOKEN")) - }, -) -``` + Let's review what's happening here: @@ -227,37 +212,7 @@ In this example, we'll demonstrate how to use different I/O manager configuratio This example is adapted from the [Transitioning data pipelines from development to production guide](/guides/deploy/dev-to-prod): -{/* TODO convert to */} -```python file=/guides/dagster/using_environment_variables_and_secrets/repository_v2.py startafter=start_new endbefore=end_new -# definitions.py - -resources = { - "local": { - "snowflake_io_manager": SnowflakePandasIOManager( - account="abc1234.us-east-1", - user=EnvVar("DEV_SNOWFLAKE_USER"), - password=EnvVar("DEV_SNOWFLAKE_PASSWORD"), - database="LOCAL", - schema=EnvVar("DEV_SNOWFLAKE_SCHEMA"), - ), - }, - "production": { - "snowflake_io_manager": SnowflakePandasIOManager( - account="abc1234.us-east-1", - user="system@company.com", - password=EnvVar("SYSTEM_SNOWFLAKE_PASSWORD"), - database="PRODUCTION", - schema="HACKER_NEWS", - ), - }, -} - -deployment_name = os.getenv("DAGSTER_DEPLOYMENT", "local") - -defs = Definitions( - assets=[items, comments, stories], resources=resources[deployment_name] -) -``` + Let's review what's happening here: From f69a36b8267f5ee6313d2daa5e1bf6e92b521e2f Mon Sep 17 00:00:00 2001 From: nikki everett Date: Thu, 6 Feb 2025 10:38:49 -0700 Subject: [PATCH 7/9] convert operate docs to CodeExample Signed-off-by: nikki everett --- .../configuration/advanced-config-types.md | 274 ++---------------- .../configuration/run-configuration.md | 88 +----- .../guides/operate/graphql/graphql-client.md | 93 +----- .../docs/guides/operate/run-executors.md | 43 +-- 4 files changed, 38 insertions(+), 460 deletions(-) diff --git a/docs/docs-beta/docs/guides/operate/configuration/advanced-config-types.md b/docs/docs-beta/docs/guides/operate/configuration/advanced-config-types.md index 52475b2fa7bfd..a46711e6ba867 100644 --- a/docs/docs-beta/docs/guides/operate/configuration/advanced-config-types.md +++ b/docs/docs-beta/docs/guides/operate/configuration/advanced-config-types.md @@ -12,17 +12,8 @@ Config fields can be annotated with metadata, which can be used to provide addit For example, we can annotate a config field with a description, which will be displayed in the documentation for the config field. We can add a value range to a field, which will be validated when config is specified. -```python file=/guides/dagster/pythonic_config/pythonic_config.py startafter=start_metadata_config endbefore=end_metadata_config dedent=4 -from dagster import Config -from pydantic import Field - -class MyMetadataConfig(Config): - person_name: str = Field(description="The name of the person to greet") - age: int = Field(gt=0, lt=100, description="The age of the person to greet") - -# errors, since age is not in the valid range! 
-MyMetadataConfig(person_name="Alice", age=200) -``` +{/* TODO add dedent=4 prop when implemented */} + ## Defaults and optional config fields @@ -30,65 +21,15 @@ Config fields can have an attached default value. Fields with defaults are not r For example, we can attach a default value of `"hello"` to the `greeting_phrase` field, and can construct `MyAssetConfig` without specifying a phrase. Fields which are marked as `Optional`, such as `person_name`, implicitly have a default value of `None`, but can also be explicitly set to `None` as in the example below. -```python file=/guides/dagster/pythonic_config/pythonic_config.py startafter=start_optional_config endbefore=end_optional_config dedent=4 -from typing import Optional -from dagster import asset, Config, materialize, RunConfig -from pydantic import Field - -class MyAssetConfig(Config): - person_name: Optional[str] = None - - # can pass default to pydantic.Field to attach metadata to the field - greeting_phrase: str = Field( - default="hello", description="The greeting phrase to use." - ) - -@asset -def greeting(config: MyAssetConfig) -> str: - if config.person_name: - return f"{config.greeting_phrase} {config.person_name}" - else: - return config.greeting_phrase - -asset_result = materialize( - [greeting], - run_config=RunConfig({"greeting": MyAssetConfig()}), -) -``` +{/* TODO add dedent=4 prop when implemented */} + ### Required config fields By default, fields which are typed as `Optional` are not required to be specified in the config, and have an implicit default value of `None`. If you want to require that a field be specified in the config, you may use an ellipsis (`...`) to [require that a value be passed](https://docs.pydantic.dev/usage/models/#required-fields). -```python file=/guides/dagster/pythonic_config/pythonic_config.py startafter=start_required_config endbefore=end_required_config dedent=4 -from typing import Optional, Callable -from dagster import asset, Config -from pydantic import Field - -class MyAssetConfig(Config): - # ellipsis indicates that even though the type is Optional, - # an input is required - person_first_name: Optional[str] = ... - - # ellipsis can also be used with pydantic.Field to attach metadata - person_last_name: Optional[Callable] = Field( - default=..., description="The last name of the person to greet" - ) - -@asset -def goodbye(config: MyAssetConfig) -> str: - full_name = f"{config.person_first_name} {config.person_last_name}".strip() - if full_name: - return f"Goodbye, {full_name}" - else: - return "Goodbye" - -# errors, since person_first_name and person_last_name are required -goodbye(MyAssetConfig()) - -# works, since both person_first_name and person_last_name are provided -goodbye(MyAssetConfig(person_first_name="Alice", person_last_name=None)) -``` +{/* TODO add dedent=4 prop when implemented */} + ## Basic data structures @@ -100,29 +41,8 @@ Basic Python data structures can be used in your config schemas along with neste For example, we can define a config schema that takes in a list of user names and a mapping of user names to user scores. -```python file=/guides/dagster/pythonic_config/pythonic_config.py startafter=start_basic_data_structures_config endbefore=end_basic_data_structures_config dedent=4 -from dagster import Config, materialize, asset, RunConfig -from typing import List, Dict - -class MyDataStructuresConfig(Config): - user_names: list[str] - user_scores: dict[str, int] - -@asset -def scoreboard(config: MyDataStructuresConfig): ... 
- -result = materialize( - [scoreboard], - run_config=RunConfig( - { - "scoreboard": MyDataStructuresConfig( - user_names=["Alice", "Bob"], - user_scores={"Alice": 10, "Bob": 20}, - ) - } - ), -) -``` +{/* TODO add dedent=4 prop when implemented */} + ## Nested schemas @@ -130,66 +50,15 @@ Schemas can be nested in one another, or in basic Python data structures. Here, we define a schema which contains a mapping of user names to complex user data objects. -```python file=/guides/dagster/pythonic_config/pythonic_config.py startafter=start_nested_schema_config endbefore=end_nested_schema_config dedent=4 -from dagster import asset, materialize, Config, RunConfig -from typing import Dict - -class UserData(Config): - age: int - email: str - profile_picture_url: str - -class MyNestedConfig(Config): - user_data: dict[str, UserData] - -@asset -def average_age(config: MyNestedConfig): ... - -result = materialize( - [average_age], - run_config=RunConfig( - { - "average_age": MyNestedConfig( - user_data={ - "Alice": UserData( - age=10, - email="alice@gmail.com", - profile_picture_url=..., - ), - "Bob": UserData( - age=20, - email="bob@gmail.com", - profile_picture_url=..., - ), - } - ) - } - ), -) -``` +{/* TODO add dedent=4 prop when implemented */} + ## Permissive schemas By default, `Config` schemas are strict, meaning that they will only accept fields that are explicitly defined in the schema. This can be cumbersome if you want to allow users to specify arbitrary fields in their config. For this purpose, you can use the `PermissiveConfig` base class, which allows arbitrary fields to be specified in the config. -```python file=/guides/dagster/pythonic_config/pythonic_config.py startafter=start_permissive_schema_config endbefore=end_permissive_schema_config dedent=4 -from dagster import asset, PermissiveConfig -from typing import Optional -import requests - -class FilterConfig(PermissiveConfig): - title: Optional[str] = None - description: Optional[str] = None - -@asset -def filtered_listings(config: FilterConfig): - # extract all config fields, including those not defined in the schema - url_params = config.dict() - return requests.get("https://my-api.com/listings", params=url_params).json() - -# can pass in any fields, including those not defined in the schema -filtered_listings(FilterConfig(title="hotel", beds=4)) -``` +{/* TODO add dedent=4 prop when implemented */} + ## Union types @@ -197,41 +66,8 @@ Union types are supported using Pydantic [discriminated unions](https://docs.pyd Here, we define a config schema which takes in a `pet` field, which can be either a `Cat` or a `Dog`, as indicated by the `pet_type` field. 
-```python file=/guides/dagster/pythonic_config/pythonic_config.py startafter=start_union_schema_config endbefore=end_union_schema_config dedent=4 -from dagster import asset, materialize, Config, RunConfig -from pydantic import Field -from typing import Union -from typing_extensions import Literal - -class Cat(Config): - pet_type: Literal["cat"] = "cat" - meows: int - -class Dog(Config): - pet_type: Literal["dog"] = "dog" - barks: float - -class ConfigWithUnion(Config): - pet: Union[Cat, Dog] = Field(discriminator="pet_type") - -@asset -def pet_stats(config: ConfigWithUnion): - if isinstance(config.pet, Cat): - return f"Cat meows {config.pet.meows} times" - else: - return f"Dog barks {config.pet.barks} times" - -result = materialize( - [pet_stats], - run_config=RunConfig( - { - "pet_stats": ConfigWithUnion( - pet=Cat(meows=10), - ) - } - ), -) -``` +{/* TODO add dedent=4 prop when implemented */} + ### YAML and config dictionary representations of union types @@ -259,41 +95,8 @@ In the config dictionary representation, the same pattern is used: Python enums which subclass `Enum` are supported as config fields. Here, we define a schema that takes in a list of users, whose roles are specified as enum values: -```python file=/guides/dagster/pythonic_config/pythonic_config.py startafter=start_enum_schema_config endbefore=end_enum_schema_config dedent=4 -from dagster import Config, RunConfig, op, job -from enum import Enum - -class UserPermissions(Enum): - GUEST = "guest" - MEMBER = "member" - ADMIN = "admin" - -class ProcessUsersConfig(Config): - users_list: dict[str, UserPermissions] - -@op -def process_users(config: ProcessUsersConfig): - for user, permission in config.users_list.items(): - if permission == UserPermissions.ADMIN: - print(f"{user} is an admin") - -@job -def process_users_job(): - process_users() - -op_result = process_users_job.execute_in_process( - run_config=RunConfig( - { - "process_users": ProcessUsersConfig( - users_list={ - "Bob": UserPermissions.GUEST, - "Alice": UserPermissions.ADMIN, - } - ) - } - ), -) -``` +{/* TODO add dedent=4 prop when implemented */} + ### YAML and config dictionary representations of enum types @@ -322,47 +125,4 @@ Config fields can have custom validation logic applied using [Pydantic validator Here, we define some validators on a configured user's name and username, which will throw exceptions if incorrect values are passed in the launchpad or from a schedule or sensor. 
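As a companion to the full snippet that follows, here's a hedged sketch of asserting that behavior in a unit test. It assumes the `UserConfig` class defined in the snippet below is importable; because `Config` classes are Pydantic models, constructing one directly runs the same validators.

```python
# Illustrative sketch only: an invalid name surfaces as a pydantic ValidationError.
import pytest
from pydantic import ValidationError


def test_name_must_contain_space():
    with pytest.raises(ValidationError):
        UserConfig(name="John", username="johndoe44")  # no space in the name
```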
-```python file=/guides/dagster/pythonic_config/pythonic_config.py startafter=start_validated_schema_config endbefore=end_validated_schema_config dedent=4 -from dagster import Config, RunConfig, op, job -from pydantic import validator - -class UserConfig(Config): - name: str - username: str - - @validator("name") - def name_must_contain_space(cls, v): - if " " not in v: - raise ValueError("must contain a space") - return v.title() - - @validator("username") - def username_alphanumeric(cls, v): - assert v.isalnum(), "must be alphanumeric" - return v - -executed = {} - -@op -def greet_user(config: UserConfig) -> None: - print(f"Hello {config.name}!") - executed["greet_user"] = True - -@job -def greet_user_job() -> None: - greet_user() - -# Input is valid, so this will work -op_result = greet_user_job.execute_in_process( - run_config=RunConfig( - {"greet_user": UserConfig(name="Alice Smith", username="alice123")} - ), -) - -# Name has no space, so this will fail -op_result = greet_user_job.execute_in_process( - run_config=RunConfig( - {"greet_user": UserConfig(name="John", username="johndoe44")} - ), -) -``` + diff --git a/docs/docs-beta/docs/guides/operate/configuration/run-configuration.md b/docs/docs-beta/docs/guides/operate/configuration/run-configuration.md index a5541448ddf42..431c17faee46e 100644 --- a/docs/docs-beta/docs/guides/operate/configuration/run-configuration.md +++ b/docs/docs-beta/docs/guides/operate/configuration/run-configuration.md @@ -23,32 +23,15 @@ During execution, the specified config is accessed within the body of the op or Here, we define a subclass of holding a single string value representing the name of a user. We can access the config through the `config` parameter in the asset body. -```python file=/guides/dagster/pythonic_config/pythonic_config.py startafter=start_basic_asset_config endbefore=end_basic_asset_config dedent=4 -from dagster import asset, Config - -class MyAssetConfig(Config): - person_name: str - -@asset -def greeting(config: MyAssetConfig) -> str: - return f"hello {config.person_name}" -``` +{/* TODO add dedent=4 prop when implemented */} + Here, we define a subclass of holding a single string value representing the name of a user. We can access the config through the `config` parameter in the op body. -```python file=/guides/dagster/pythonic_config/pythonic_config.py startafter=start_basic_op_config endbefore=end_basic_op_config dedent=4 -from dagster import op, Config - -class MyOpConfig(Config): - person_name: str - -@op -def print_greeting(config: MyOpConfig): - print(f"hello {config.person_name}") -``` + You can also build config into jobs. @@ -61,15 +44,8 @@ These examples showcase the most basic config types that can be used. For more i Configurable parameters for a resource are defined by specifying attributes for a resource class, which subclasses . The below resource defines a configurable connection URL, which can be accessed in any methods defined on the resource. -```python file=/guides/dagster/pythonic_config/pythonic_config.py startafter=start_basic_resource_config endbefore=end_basic_resource_config dedent=4 -from dagster import op, ConfigurableResource - -class MyDatabaseResource(ConfigurableResource): - connection_url: str - - def query(self, query: str): - return get_engine(self.connection_url).execute(query) -``` +{/* TODO add dedent=4 prop when implemented */} + For more information on using resources, refer to the [Resources guide](/guides/build/external-resources/). 
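To make the connection concrete, here's a hedged sketch (not part of the original docs) of providing the `MyDatabaseResource` class defined above to a code location and using it from an asset. The asset name, resource key, and connection URL are illustrative assumptions.

```python
from dagster import Definitions, asset


@asset
def row_count(db: MyDatabaseResource):
    # the parameter name ("db") must match the resource key supplied in Definitions below
    return db.query("SELECT COUNT(*) FROM my_table")


defs = Definitions(
    assets=[row_count],
    resources={"db": MyDatabaseResource(connection_url="postgresql://localhost/dev")},
)
```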
@@ -82,22 +58,8 @@ To execute a job or materialize an asset that specifies config, you'll need to p When specifying config from the Python API, we can use the `run_config` argument for or . This takes a object, within which we can supply config on a per-op or per-asset basis. The config is specified as a dictionary, with the keys corresponding to the op/asset names and the values corresponding to the config values. -```python file=/guides/dagster/pythonic_config/pythonic_config.py startafter=start_execute_with_config endbefore=end_execute_with_config dedent=4 -from dagster import job, materialize, op, RunConfig - -@job -def greeting_job(): - print_greeting() - -job_result = greeting_job.execute_in_process( - run_config=RunConfig({"print_greeting": MyOpConfig(person_name="Alice")}) -) - -asset_result = materialize( - [greeting], - run_config=RunConfig({"greeting": MyAssetConfig(person_name="Alice")}), -) -``` +{/* TODO add dedent=4 prop when implemented */} + @@ -135,43 +97,15 @@ dagster job execute --config my_config.yaml Dagster validates any provided run config against the corresponding Pydantic model. It will abort execution with a or Pydantic `ValidationError` if validation fails. For example, both of the following will fail, because there is no `nonexistent_config_value` in the config schema: -```python file=/guides/dagster/pythonic_config/pythonic_config.py startafter=start_execute_with_bad_config endbefore=end_execute_with_bad_config dedent=4 -@job -def greeting_job(): - print_greeting() - -op_result = greeting_job.execute_in_process( - run_config=RunConfig( - {"print_greeting": MyOpConfig(nonexistent_config_value=1)} - ), -) - -asset_result = materialize( - [greeting], - run_config=RunConfig({"greeting": MyAssetConfig(nonexistent_config_value=1)}), -) -``` +{/* TODO add dedent=4 prop when implemented */} + ### Using environment variables with config Assets and ops can be configured using environment variables by passing an when constructing a config object. This is useful when the value is sensitive or may vary based on environment. If using Dagster+, environment variables can be [set up directly in the UI](/guides/deploy/using-environment-variables-and-secrets). -```python file=/guides/dagster/pythonic_config/pythonic_config.py startafter=start_execute_with_config_envvar endbefore=end_execute_with_config_envvar dedent=4 -from dagster import job, materialize, op, RunConfig, EnvVar - -job_result = greeting_job.execute_in_process( - run_config=RunConfig( - {"print_greeting": MyOpConfig(person_name=EnvVar("PERSON_NAME"))} - ) -) - -asset_result = materialize( - [greeting], - run_config=RunConfig( - {"greeting": MyAssetConfig(person_name=EnvVar("PERSON_NAME"))} - ), -) -``` +{/* TODO add dedent=4 prop when implemented */} + Refer to the [Environment variables and secrets guide](/guides/deploy/using-environment-variables-and-secrets) for more general info about environment variables in Dagster. 
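One detail worth illustrating: `EnvVar` records the *name* of the environment variable rather than its value, and the value is resolved when the run is launched. The sketch below is an illustration under assumptions (the variable name is hypothetical, and `EnvVar.get_value` is assumed to be available in your Dagster version):

```python
import os

from dagster import EnvVar

os.environ["PERSON_NAME"] = "Alice"  # normally set in the deployment, not in code

person_name = EnvVar("PERSON_NAME")
# get_value() resolves the variable at call time; config objects resolve it at run launch
assert person_name.get_value() == "Alice"
```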
diff --git a/docs/docs-beta/docs/guides/operate/graphql/graphql-client.md b/docs/docs-beta/docs/guides/operate/graphql/graphql-client.md index bdd1d59aa7632..0d897148bc20a 100644 --- a/docs/docs-beta/docs/guides/operate/graphql/graphql-client.md +++ b/docs/docs-beta/docs/guides/operate/graphql/graphql-client.md @@ -51,22 +51,11 @@ Note that all GraphQL methods on the API are not yet available in Python - the ` The snippet below shows example instantiation of the client: -{/* TODO convert to */} -```python file=/concepts/webserver/graphql/client_example.py startafter=start_setup_marker endbefore=end_setup_marker -from dagster_graphql import DagsterGraphQLClient - -client = DagsterGraphQLClient("localhost", port_number=3000) -``` + If you are using Dagster+, you can configure your client against the Dagster+ API by passing your deployment-specific URL and a User Token to the client as follows: -```python file=/concepts/webserver/graphql/client_example.py startafter=start_cloud_usage endbefore=end_cloud_usage -url = "yourorg.dagster.cloud/prod" # Your deployment-scoped url -user_token = ( # a User Token generated from the Organization Settings page in Dagster+. - "your_token_here" -) -client = DagsterGraphQLClient(url, headers={"Dagster-Cloud-Api-Token": user_token}) -``` + ## Examples @@ -74,22 +63,7 @@ client = DagsterGraphQLClient(url, headers={"Dagster-Cloud-Api-Token": user_toke You can use the client to get the status of a job run as follows: -{/* TODO convert to */} -```python file=/concepts/webserver/graphql/client_example.py startafter=start_run_status_marker endbefore=end_run_status_marker -from dagster_graphql import DagsterGraphQLClientError - -from dagster import DagsterRunStatus - -try: - status: DagsterRunStatus = client.get_run_status(RUN_ID) - if status == DagsterRunStatus.SUCCESS: - do_something_on_success() - else: - do_something_else() -except DagsterGraphQLClientError as exc: - do_something_with_exc(exc) - raise exc -``` + ### Reloading all repositories in a repository location @@ -97,38 +71,13 @@ You can also reload a repository location in a Dagster deployment. This reloads all repositories in that repository location. This is useful in a variety of contexts, including refreshing the Dagster UI without restarting the server. 
Example usage is as follows: -```python file=/concepts/webserver/graphql/client_example.py startafter=start_reload_repo_location_marker endbefore=end_reload_repo_location_marker -from dagster_graphql import ReloadRepositoryLocationInfo, ReloadRepositoryLocationStatus - -reload_info: ReloadRepositoryLocationInfo = client.reload_repository_location(REPO_NAME) -if reload_info.status == ReloadRepositoryLocationStatus.SUCCESS: - do_something_on_success() -else: - raise Exception( - "Repository location reload failed because of a " - f"{reload_info.failure_type} error: {reload_info.message}" - ) -``` + ### Submitting a job run You can use the client to submit a job run as follows: -```python file=/concepts/webserver/graphql/client_example.py startafter=start_submit_marker_default endbefore=end_submit_marker_default -from dagster_graphql import DagsterGraphQLClientError - -try: - new_run_id: str = client.submit_job_execution( - JOB_NAME, - repository_location_name=REPO_LOCATION_NAME, - repository_name=REPO_NAME, - run_config={}, - ) - do_something_on_success(new_run_id) -except DagsterGraphQLClientError as exc: - do_something_with_exc(exc) - raise exc -``` + ### Shutting down a repository location server @@ -138,37 +87,11 @@ One way to cause your server to restart and your repositories to be reloaded is Example usage: -{/* TODO convert to */} -```python file=/concepts/webserver/graphql/client_example.py startafter=start_shutdown_repo_location_marker endbefore=end_shutdown_repo_location_marker -from dagster_graphql import ( - ShutdownRepositoryLocationInfo, - ShutdownRepositoryLocationStatus, -) - -shutdown_info: ShutdownRepositoryLocationInfo = client.shutdown_repository_location( - REPO_NAME -) -if shutdown_info.status == ShutdownRepositoryLocationStatus.SUCCESS: - do_something_on_success() -else: - raise Exception(f"Repository location shutdown failed: {shutdown_info.message}") -``` + + #### Repository location and repository inference Note that specifying the repository location name and repository name are not always necessary; the GraphQL client will infer the repository name and repository location name if the job name is unique. -{/* TODO convert to */} -```python file=/concepts/webserver/graphql/client_example.py startafter=start_submit_marker_job_name_only endbefore=end_submit_marker_job_name_only -from dagster_graphql import DagsterGraphQLClientError - -try: - new_run_id: str = client.submit_job_execution( - JOB_NAME, - run_config={}, - ) - do_something_on_success(new_run_id) -except DagsterGraphQLClientError as exc: - do_something_with_exc(exc) - raise exc -``` + diff --git a/docs/docs-beta/docs/guides/operate/run-executors.md b/docs/docs-beta/docs/guides/operate/run-executors.md index c2e14f8934628..1db8692a6363c 100644 --- a/docs/docs-beta/docs/guides/operate/run-executors.md +++ b/docs/docs-beta/docs/guides/operate/run-executors.md @@ -26,23 +26,7 @@ Every job has an executor. The default executor is the to the `executor_def` parameter of or : -{/* TODO convert to */} -```python file=/deploying/executors/executors.py startafter=start_executor_on_job endbefore=end_executor_on_job -from dagster import graph, job, multiprocess_executor - - -# Providing an executor using the job decorator -@job(executor_def=multiprocess_executor) -def the_job(): ... - - -@graph -def the_graph(): ... - - -# Providing an executor using graph_def.to_job(...) 
-other_job = the_graph.to_job(executor_def=multiprocess_executor) -``` + ### For a code location @@ -50,30 +34,7 @@ To specify a default executor for all jobs and assets provided to a code locatio If a job explicitly specifies an executor, then that executor will be used. Otherwise, jobs that don't specify an executor will use the default provided to the code location: -{/* TODO convert to */} -```python file=/deploying/executors/executors.py startafter=start_executor_on_repo endbefore=end_executor_on_repo -from dagster import multiprocess_executor, define_asset_job, asset, Definitions - - -@asset -def the_asset(): - pass - - -asset_job = define_asset_job("the_job", selection="*") - - -@job -def op_job(): ... - - -# op_job and asset_job will both use the multiprocess_executor, -# since neither define their own executor. - -defs = Definitions( - assets=[the_asset], jobs=[asset_job, op_job], executor=multiprocess_executor -) -``` + :::note From f7431cba34124e03fc3be0da3a27ee76499a52c3 Mon Sep 17 00:00:00 2001 From: nikki everett Date: Thu, 6 Feb 2025 11:19:00 -0700 Subject: [PATCH 8/9] convert test docs to CodeExample Signed-off-by: nikki everett --- .../test/running-a-subset-of-asset-checks.md | 73 +-------------- .../testing-partitioned-config-and-jobs.md | 89 +------------------ 2 files changed, 6 insertions(+), 156 deletions(-) diff --git a/docs/docs-beta/docs/guides/test/running-a-subset-of-asset-checks.md b/docs/docs-beta/docs/guides/test/running-a-subset-of-asset-checks.md index 649f657b070a2..460b6cd8d3319 100644 --- a/docs/docs-beta/docs/guides/test/running-a-subset-of-asset-checks.md +++ b/docs/docs-beta/docs/guides/test/running-a-subset-of-asset-checks.md @@ -22,42 +22,7 @@ Inside the body of the function, we can use `AssetCheckExecutionContext.selected As we don't know in advance which checks will be executed, we explicitly `yield` each asset check result that we're expected to create: -```python file=/concepts/assets/asset_checks/subset_multi_asset_check.py -from collections.abc import Iterable - -from dagster import ( - AssetCheckExecutionContext, - AssetCheckKey, - AssetCheckResult, - AssetCheckSpec, - AssetKey, - multi_asset_check, -) - - -@multi_asset_check( - specs=[ - AssetCheckSpec(name="asset_check_one", asset="asset_one"), - AssetCheckSpec(name="asset_check_two", asset="asset_two"), - ], - can_subset=True, -) -def the_check(context: AssetCheckExecutionContext) -> Iterable[AssetCheckResult]: - if ( - AssetCheckKey(AssetKey("asset_one"), "asset_check_one") - in context.selected_asset_check_keys - ): - yield AssetCheckResult( - passed=True, metadata={"foo": "bar"}, check_name="asset_check_one" - ) - if ( - AssetCheckKey(AssetKey("asset_two"), "asset_check_two") - in context.selected_asset_check_keys - ): - yield AssetCheckResult( - passed=True, metadata={"foo": "bar"}, check_name="asset_check_two" - ) -``` + ## Subsetting checks in @multi_assets @@ -65,41 +30,7 @@ When using [multi-assets](/guides/build/assets/defining-assets#multi-asset), Dag In the following example, we only want to execute a check when the `multi_asset_piece_1` asset produced by the `multi_asset_1_and_2` multi-asset is materialized: -{/* TODO convert to */} -```python file=/concepts/assets/asset_checks/subset_check_multi_asset.py -from dagster import ( - AssetCheckKey, - AssetCheckResult, - AssetCheckSpec, - AssetExecutionContext, - AssetKey, - AssetSpec, - MaterializeResult, - multi_asset, -) - - -@multi_asset( - specs=[ - AssetSpec("multi_asset_piece_1", group_name="asset_checks", skippable=True), - 
AssetSpec("multi_asset_piece_2", group_name="asset_checks", skippable=True), - ], - check_specs=[AssetCheckSpec("my_check", asset="multi_asset_piece_1")], - can_subset=True, -) -def multi_asset_1_and_2(context: AssetExecutionContext): - if AssetKey("multi_asset_piece_1") in context.selected_asset_keys: - yield MaterializeResult(asset_key="multi_asset_piece_1") - # The check will only execute when multi_asset_piece_1 is materialized - if ( - AssetCheckKey(AssetKey("multi_asset_piece_1"), "my_check") - in context.selected_asset_check_keys - ): - yield AssetCheckResult(passed=True, metadata={"foo": "bar"}) - if AssetKey("multi_asset_piece_2") in context.selected_asset_keys: - # No check on multi_asset_piece_2 - yield MaterializeResult(asset_key="multi_asset_piece_2") -``` + Let's review what we did to accomplish this: diff --git a/docs/docs-beta/docs/guides/test/testing-partitioned-config-and-jobs.md b/docs/docs-beta/docs/guides/test/testing-partitioned-config-and-jobs.md index 02a1b4d0e27f1..8f243823e6af8 100644 --- a/docs/docs-beta/docs/guides/test/testing-partitioned-config-and-jobs.md +++ b/docs/docs-beta/docs/guides/test/testing-partitioned-config-and-jobs.md @@ -18,90 +18,12 @@ Invoking a function. -{/* TODO convert to */} -```python file=/concepts/partitions_schedules_sensors/partitioned_config_test.py startafter=start_partition_config endbefore=end_partition_config -from dagster import validate_run_config, daily_partitioned_config -from datetime import datetime - - -@daily_partitioned_config(start_date=datetime(2020, 1, 1)) -def my_partitioned_config(start: datetime, _end: datetime): - return { - "ops": { - "process_data_for_date": {"config": {"date": start.strftime("%Y-%m-%d")}} - } - } - - -def test_my_partitioned_config(): - # assert that the decorated function returns the expected output - run_config = my_partitioned_config(datetime(2020, 1, 3), datetime(2020, 1, 4)) - assert run_config == { - "ops": {"process_data_for_date": {"config": {"date": "2020-01-03"}}} - } - - # assert that the output of the decorated function is valid configuration for the - # partitioned_op_job job - assert validate_run_config(partitioned_op_job, run_config) -``` + If you want to test that a creates the partitions you expect, use the `get_partition_keys` or `get_run_config_for_partition_key` functions: -{/* TODO convert to */} -```python file=/concepts/partitions_schedules_sensors/partitioned_config_test.py startafter=start_partition_keys endbefore=end_partition_keys -from dagster import Config, OpExecutionContext - - -@daily_partitioned_config(start_date=datetime(2020, 1, 1), minute_offset=15) -def my_offset_partitioned_config(start: datetime, _end: datetime): - return { - "ops": { - "process_data": { - "config": { - "start": start.strftime("%Y-%m-%d-%H:%M"), - "end": _end.strftime("%Y-%m-%d-%H:%M"), - } - } - } - } - - -class ProcessDataConfig(Config): - start: str - end: str - -@op -def process_data(context: OpExecutionContext, config: ProcessDataConfig): - s = config.start - e = config.end - context.log.info(f"processing data for {s} - {e}") - - -@job(config=my_offset_partitioned_config) -def do_more_stuff_partitioned(): - process_data() - - -def test_my_offset_partitioned_config(): - # test that the partition keys are what you expect - keys = my_offset_partitioned_config.get_partition_keys() - assert keys[0] == "2020-01-01" - assert keys[1] == "2020-01-02" - - # test that the run_config for a partition is valid for partitioned_op_job - run_config = 
my_offset_partitioned_config.get_run_config_for_partition_key(keys[0]) - assert validate_run_config(do_more_stuff_partitioned, run_config) - - # test that the contents of run_config are what you expect - assert run_config == { - "ops": { - "process_data": { - "config": {"start": "2020-01-01-00:15", "end": "2020-01-02-00:15"} - } - } - } -``` + ## Testing partitioned jobs @@ -109,8 +31,5 @@ def test_my_offset_partitioned_config(): To run a partitioned job in-process on a particular partition, supply a value for the `partition_key` argument of [`dagster.JobDefinition.execute_in_process`](/api/python-api/execution): -{/* TODO convert to */} -```python file=/concepts/partitions_schedules_sensors/partitioned_job_test.py startafter=start endbefore=end -def test_partitioned_op_job(): - assert partitioned_op_job.execute_in_process(partition_key="2020-01-01").success -``` + + From 49b78e031b2dbe2d0bfae6d5dd3d4ed674b6dcc0 Mon Sep 17 00:00:00 2001 From: nikki everett Date: Thu, 6 Feb 2025 11:22:32 -0700 Subject: [PATCH 9/9] last CodeExample conversion Signed-off-by: nikki everett --- docs/docs-beta/docs/guides/monitor/logging/index.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/docs/docs-beta/docs/guides/monitor/logging/index.md b/docs/docs-beta/docs/guides/monitor/logging/index.md index 06b18c8ee1dac..0d0ac28be1034 100644 --- a/docs/docs-beta/docs/guides/monitor/logging/index.md +++ b/docs/docs-beta/docs/guides/monitor/logging/index.md @@ -66,12 +66,8 @@ Windows / Azure users may need to enable the environment variable `PYTHONLEGACYW Loggers can be configured when you run a job. For example, to filter all messages below `ERROR` out of the colored console logger, add the following lines to your `config.yaml`: -```yaml file=/concepts/logging/config.yaml -loggers: - console: - config: - log_level: ERROR -``` + + When a job with the above configuration is executed, you'll only see the `ERROR` level logs.
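To see the filter end to end, here's a hedged sketch (not from the original docs) of an op that logs at several levels; with the `ERROR` threshold above, only the last message reaches the console. The op and job names are placeholders, and the same logger configuration is passed inline via `run_config`:

```python
from dagster import OpExecutionContext, job, op


@op
def log_at_several_levels(context: OpExecutionContext):
    context.log.debug("filtered out by the ERROR-level console logger")
    context.log.info("also filtered out")
    context.log.error("this message still appears in the console output")


@job
def logging_demo_job():
    log_at_several_levels()


if __name__ == "__main__":
    logging_demo_job.execute_in_process(
        run_config={"loggers": {"console": {"config": {"log_level": "ERROR"}}}}
    )
```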