Dbt gatekeeper cert #98

Draft: wants to merge 17 commits into base `main`

Changes from all commits
82 changes: 82 additions & 0 deletions .circleci/config.yml
@@ -0,0 +1,82 @@
version: 2.1

executors:
  standard:
    docker:
      - image: circleci/python:3.8
    working_directory: "~/lib"

# -----------------
# Reusable commands
# -----------------

commands:
  checkout_source:
    steps:
      - restore_cache:
          keys:
            - source-{{ .Branch }}-{{ .Revision }}
            - source-{{ .Branch }}-
            - source-
      - checkout
      - save_cache:
          key: source-{{ .Branch }}-{{ .Revision }}
          paths:
            - "./.git"

  update_virtualenv:
    steps:
      - restore_cache:
          keys:
            - virtualenv-{{ .Environment.CACHE_VERSION }}-{{ .Branch }}-{{ checksum "Pipfile.lock" }}
            - virtualenv-{{ .Environment.CACHE_VERSION }}-{{ .Branch }}-
            - virtualenv-{{ .Environment.CACHE_VERSION }}-
      - run:
          name: Install Python packages
          command: pipenv install --dev --system
      - save_cache:
          key: virtualenv-{{ .Environment.CACHE_VERSION }}-{{ .Branch }}-{{ checksum "Pipfile.lock" }}
          paths:
            - "./.venv"

  prepare_dbt:
    steps:
      - run:
          name: Installing Simba + UnixODBC
          command: |
            sudo mkdir -p /opt/simba/spark/lib/64/
            sudo tar -xf driver/simba.tar.gz -C /opt/simba/spark/lib/64/
            sudo apt-get install unixodbc-dev -y

# --------------
# Pipeline tasks
# --------------

jobs:
  run-python-tests:
    executor: standard
    steps:
      - checkout
      - update_virtualenv
      - run:
          name: Run Python tests
          command: make run-python-tests

  run-dbt-project-evaluator:
    executor: standard
    environment:
      SALT: himalayan
      DBT_PROFILES_DIR: ~/lib/testing/
      TESTING_PROFILE_NAME: jaffle_shop
    steps:
      - checkout
      - update_virtualenv
      - run:
          name: Run dbt-project-evaluator package
          command: make run-dbt-project-evaluator

# Orchestrate our job run sequence
workflows:
  test:
    jobs:
      - run-python-tests
      - run-dbt-project-evaluator
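
The `run-dbt-project-evaluator` job points `DBT_PROFILES_DIR` at `~/lib/testing/` and expects a profile named `jaffle_shop`. That profile is not part of this diff; a minimal Databricks-shaped sketch might look like the following, where the schema name and the environment variable names are assumptions:

```
jaffle_shop:
  target: ci
  outputs:
    ci:
      type: databricks
      schema: jaffle_shop_ci              # assumed CI schema name
      host: "{{ env_var('DATABRICKS_HOST') }}"
      http_path: "{{ env_var('DATABRICKS_HTTP_PATH') }}"
      token: "{{ env_var('DATABRICKS_TOKEN') }}"
      threads: 4
```
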
14 changes: 13 additions & 1 deletion .gitignore
@@ -1,5 +1,17 @@

target/
dbt_modules/
dbt_packages/
logs/
testing/
package_projects/

**/.DS_Store
.env
packages.yml
selectors.yml


# Python temp files
*.pyc
__pycache__
Pipfile.lock
40 changes: 40 additions & 0 deletions Makefile
@@ -0,0 +1,40 @@
init:
	pipenv update
	pipenv run dbt deps

# Truncate the .env file
clean-env:
	:> .env

env-development-salt:
	echo SALT=maldon >> .env

# Derive the Databricks target from the current branch name, mapping "/" and "-" to "_"
env-target:
	echo DATABRICKS_TARGET=$$(git symbolic-ref --short HEAD | tr /- _) >> .env

# Overwrite each installed package's dbt_project.yml with our copy from package_projects/
package-project:
	for PACKAGE in dbt_project_evaluator ; do \
		cp package_projects/$$PACKAGE.yml dbt_packages/$$PACKAGE/dbt_project.yml ; \
	done

build-env: clean-env env-development-salt env-target package-project

dbt-deps:
	pipenv run dbt deps

dbt-build: build-env
	pipenv run dbt build --selector jaffle_shop

run-dbt-project-evaluator: dbt-deps build-env
	pipenv run dbt --warn-error build --select package:dbt_project_evaluator dbt_project_evaluator_exceptions

lint: build-env
	pipenv run sqlfluff lint

format: build-env
	pipenv run sqlfluff fix

run-python-tests:
	pipenv run pytest --quiet --show-capture=no --tb=no

run-python-tests-detailed:
	pipenv run pytest
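
Note that `dbt-build` relies on a selector named `jaffle_shop`, but `selectors.yml` is in `.gitignore`, so you will need your own copy. A minimal sketch of what it might contain (the definition here is an assumption):

```
selectors:
  - name: jaffle_shop
    description: Everything in the jaffle_shop project, excluding installed packages.
    definition:
      method: package
      value: jaffle_shop
```
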
17 changes: 17 additions & 0 deletions Pipfile
@@ -0,0 +1,17 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
dbt-core = "==1.3.0"
dbt-databricks = {extras = ["odbc"]}
glob2 = "*"

[dev-packages]
pytest = "*"
pytest-mock = "*"

[requires]
python_version = "3.8"
1,028 changes: 1,028 additions & 0 deletions Pipfile.lock

Large diffs are not rendered by default.

125 changes: 83 additions & 42 deletions README.md
@@ -1,18 +1,13 @@
## Testing dbt project: `jaffle_shop`

`jaffle_shop` is a fictional ecommerce store. This dbt project transforms raw data from an app database into customers and orders warehouse models and some basic analytics models.

### What is this repo?
What this repo _is_:
- A self-contained playground dbt project, useful for testing out scripts, and communicating some of the core dbt concepts.
- A repo for aspiring dbt gatekeepers to put their learnings to the test and transform a seemingly unstructured repo into one fit for the Octopus datalake.

What this repo _is not_:
- A tutorial — check out the [Getting Started Tutorial](https://docs.getdbt.com/tutorial/setting-up) for that. Notably, this repo contains some anti-patterns to make it self-contained, namely the use of seeds instead of sources.
- A demonstration of best practices — check out the [dbt Learn Demo](https://github.com/dbt-labs/dbt-learn-demo) repo instead. We want to keep this project as simple as possible. As such, we chose not to implement:
  - our standard file naming patterns (which make more sense on larger projects, rather than this five-model project)
  - a pull request flow
  - CI/CD integrations
- A demonstration of using dbt for a highly complex project, or a demo of advanced features — some of these are included and we'll add to them over time, but for now we're just trying to keep things simple here!

### What's in this repo?
This repo contains [seeds](https://docs.getdbt.com/docs/building-a-dbt-project/seeds) that include some (fake) raw data from a fictional app.
@@ -21,51 +16,97 @@
The raw data consists of customers, orders, and payments, with the following entity-relationship diagram:

![Jaffle Shop ERD](/etc/jaffle_shop_erd.png)

### How to use this repo and become a gatekeeper?

In its base state the repo is not fit for purpose. While it works, it doesn't comply with many of the conventions we enforce at Octopus, so your goal is to remedy that.

Use `make init` to get started with running the rest of the make commands.

### So what needs doing to the repo?

The point of being a gatekeeper is being able to look at a PR and know where to look for possible convention breaches.
Check the [data platform docs](http://docs.eks.octopus.engineering/reference/dbt_gatekeeper_checklist/) site for tips on how to gatekeep.

#### Fixes

Here are the fixes that need implementing:

1) All `.yml` files should be renamed to specify what they apply to. For example, each model directory should contain a `_models.yml` file (the `_` ensures the file sorts to the top of the directory for easy access) and may or may not contain a `_docs.md` file for documentation.
2) Staging models should be split by which source they are coming from. As the sources in this repo all come from seeds, the staging models on top of them should be in the `src_seed` directory along with their respective `_models.yml` and `_sources.yml` files.
3) `stg_customers` contains PII in the `first_name` and `last_name` columns, so these need to be hashed. Move this model into a `src_seed/sensitive` directory and mark each of the sensitive columns as sensitive in `src_seed/sensitive/_models.yml` using the syntax:
```
columns:
  - name: customer_id
    tests:
      - unique
      - not_null
  - name: first_name
    meta:
      sensitive: true
  - name: last_name
    meta:
      sensitive: true
```
4) The `customers.sql` and `orders.sql` models are traditional warehouse models and should be in a `warehouse` directory with their respective `_docs.md` and `_models.yml` files (see the layout sketch after this list).
5) We use a package called [dbt_project_evaluator](https://github.com/dbt-labs/dbt-project-evaluator) to test the structure of the dbt project - it checks for lineage issues. One of its major checks is whether staging models refer to other staging models, which is normally not allowed.

However, we do exactly that when hashing sensitive models, so we need to make an exception. To do this, create a new seed called `dbt_project_evaluator_exceptions.csv` with the following content:
```
fct_name,column_name,id_to_exclude,comment
fct_staging_dependent_on_staging,parent,stg_customers_pii,Scrubbing pii permitted in staging layer.
```
This will disable the `fct_staging_dependent_on_staging` test for `stg_customers_pii` where it is the parent of another staging model, and record the reason it's been omitted: `Scrubbing pii permitted in staging layer.`

This is a bit niche, but `dbt_project_evaluator` will become a big part of our testing process in the future, so it's important to understand how it works.
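
Taken together, fixes 1-4 imply a layout along the following lines. This is a sketch: the directory names come from the list above, while the exact file split is illustrative.

```
models/
├── src_seed/
│   ├── _models.yml
│   ├── _sources.yml
│   ├── stg_orders.sql
│   ├── stg_payments.sql
│   └── sensitive/
│       ├── _models.yml
│       └── stg_customers_pii.sql
└── warehouse/
    ├── _docs.md
    ├── _models.yml
    ├── customers.sql
    └── orders.sql
```
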
#### New Models

You've also had a request from the SMT asking for two dashboards, one for finance and one for sales. They need the following shown:
- Finance - Total value of orders returned by customer
- Sales - The customer count by month for customers making their first order

There are two possible approaches to this:
1) How we do things at the time of writing - create a final model per dashboard showing the relevant information and assign an exposure to each with a dummy URL (a worked exposure sketch follows the metrics example below):
```
url: https://inksacio.eks.octopus.engineering/my_certification_dashboard/
```
- Put each model into a directory specific to its business unit, like `models/final/sales/fnl_sales_newcustomers.sql`
- Make sure to write a `_models.yml` in each directory.
2) How we will do things in the future - make the required data available via [metrics](https://docs.getdbt.com/docs/build/metrics) configured directly on the warehouse model configs or in a `_metrics.yml` file.

For example:
```
metrics:
  - name: new_customers
    label: New Customers
    model: ref('wh_customers')
    description: ""

    calculation_method: count_distinct
    expression: customer_id

    timestamp: first_order
    time_grains: [day, week, month, quarter, year]

    # general properties
    config:
      enabled: true

    meta: {team: Sales}
```
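
For the first approach, the exposure entry might look something like the sketch below. The model name matches the example path above; the owner details and description are placeholders.

```
version: 2

exposures:
  - name: sales_new_customers
    type: dashboard
    url: https://inksacio.eks.octopus.engineering/my_certification_dashboard/
    description: Customer count by month for customers making their first order.
    depends_on:
      - ref('fnl_sales_newcustomers')
    owner:
      name: Sales team
      email: sales@example.com
```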

### Running this project
To get up and running with this project:
1. Install dbt using [these instructions](https://docs.getdbt.com/docs/installation).

2. Clone this repository.

3. Change into the `jaffle_shop` directory from the command line:
```bash
$ cd jaffle_shop
```

4. Set up a profile called `jaffle_shop` to connect to a data warehouse by following [these instructions](https://docs.getdbt.com/docs/configure-your-profile). If you have access to a data warehouse, you can use those credentials – we recommend setting your [target schema](https://docs.getdbt.com/docs/configure-your-profile#section-populating-your-profile) to be a new schema (dbt will create the schema for you, as long as you have the right privileges). If you don't have access to an existing data warehouse, you can also set up a local Postgres database and connect to it in your profile.

5. Ensure your profile is set up correctly from the command line:
```bash
$ dbt debug
```

6. Load the CSVs with the demo data set. This materializes the CSVs as tables in your target schema. Note that a typical dbt project **does not require this step** since dbt assumes your raw data is already in your warehouse.
```bash
$ dbt seed
```

7. Run the models:
```bash
$ dbt run
```

> **NOTE:** If this step fails, it might mean that you need to make small changes to the SQL in the models folder to adjust for the flavor of SQL of your target database. Definitely consider this if you are using a community-contributed adapter.

You can use the `make run-python-tests` command to see if your changes have worked; alternatively, when you open a PR from your branch, CircleCI will run tests to ensure that your changes comply with Octopus conventions. This runs the first set of tests.

8. Test the output of the models:
```bash
$ dbt test
```
If all your tests pass... you've passed this section of the certification! Let one of the @dbt_gatekeepers know and send them a link to your PR.
Remember not to merge it; the repo is broken on purpose!

9. Generate documentation for the project:
```bash
$ dbt docs generate
```

10. View the documentation for the project:
```bash
$ dbt docs serve
```

### What is a jaffle?
A jaffle is a toasted sandwich with crimped, sealed edges. Invented in Bondi in 1949, the humble jaffle is an Australian classic. The sealed edges allow jaffle-eaters to enjoy liquid fillings inside the sandwich, which reach temperatures close to the core of the earth during cooking. Often consumed at home after a night out, the most classic filling is tinned spaghetti, while my personal favourite is leftover beef stew with melted cheese.
13 changes: 8 additions & 5 deletions dbt_project.yml
@@ -5,11 +5,10 @@
version: '0.1'

profile: 'jaffle_shop'

macro-paths: ["jaffle_shop/macros"]
model-paths: ["jaffle_shop/models"]
seed-paths: ["jaffle_shop/seeds"]
snapshot-paths: ["jaffle_shop/snapshots"]

target-path: "target"
clean-targets:
@@ -19,6 +18,10 @@

require-dbt-version: [">=1.0.0", "<2.0.0"]

dispatch:
  - macro_namespace: dbt_utils
    search_order: ['dbt_project_evaluator', 'spark_utils', 'dbt_utils']

models:
  jaffle_shop:
    materialized: table
45 changes: 45 additions & 0 deletions jaffle_shop/macros/sensitive/_macros.yml
@@ -0,0 +1,45 @@
version: 2

macros:
  - name: get_meta_columns
    description: '{{ doc("get_meta_columns") }}'
    arguments:
      - name: model_name
        type: string
        description: Name of the dbt model. Typically the file name.
      - name: meta_key
        type: string
        description: The key to look for in the `meta` dictionary.
      - name: node_type
        type: string
        description: Defaults to model. Could be source.
      - name: project
        type: string
        description: Defaults to `datalake_models`. Could be something else.
  - name: hash_sensitive_columns
    description: '{{ doc("hash_sensitive_columns") }}'
    arguments:
      - name: source_table
        type: string
        description: The name of the model to select from.
  - name: nohash_sensitive_columns
    description: '{{ doc("nohash_sensitive_columns") }}'
    arguments:
      - name: source_table
        type: string
        description: The name of the model to select from.
      - name: join_key
        type: string
        description: An optional hashed join key, if the joining key is also sensitive.
  - name: get_salt
    description: '{{ doc("get_salt") }}'
    arguments:
      - name: column_name
        type: string
        description: Name of the column to get the salt for.
  - name: hash_of_column
    description: '{{ doc("hash_of_column") }}'
    arguments:
      - name: column
        type: string
        description: Name of the column to generate hashing SQL for.