Skip to content

Commit

Permalink
feat: add option to embed data when exporting report to notebook (#206)
Browse files Browse the repository at this point in the history
Resolves #21
  • Loading branch information
mbelak-dtml committed Mar 7, 2024
1 parent 8f63194 commit 9e6ec93
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 8 deletions.
88 changes: 81 additions & 7 deletions edvart/report.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import base64
import logging
import os
import pickle
import sys
import warnings
from abc import ABC
from copy import copy
from enum import auto
from typing import List, Optional, Tuple, Union

import isort
Expand All @@ -30,11 +33,25 @@
from edvart.report_sections.univariate_analysis import UnivariateAnalysis
from edvart.utils import env_var

if sys.version_info < (3, 11):
# Python 3.11+ StrEnum behaves as LowercaseStrEnum from strenum package
from strenum import LowercaseStrEnum as StrEnum
else:
from enum import StrEnum


class EmptyReportWarning(UserWarning):
    """Warning category signalling that a report has no sections."""


class ExportDataMode(StrEnum):
"""Data export mode for the report."""

NONE = auto()
FILE = auto()
EMBED = auto()


class ReportBase(ABC):
"""
Abstract base class for reports.
Expand All @@ -55,6 +72,8 @@ class ReportBase(ABC):
"import plotly.io as pio",
}

_DEFAULT_LOAD_DATA_CODE = "df = ... # TODO: Fill in code for loading data"

def __init__(
self,
dataframe: pd.DataFrame,
Expand Down Expand Up @@ -84,27 +103,77 @@ def show(self) -> None:
for section in self.sections:
section.show(self.df)

def _export_data(
    self, export_data_mode: ExportDataMode, notebook_file_path: Union[str, os.PathLike]
) -> Tuple[str, List[str]]:
    """
    Generates code for loading exported data into the exported notebook.

    Parameters
    ----------
    export_data_mode : ExportDataMode
        The mode of exporting the data.
    notebook_file_path : str or PathLike
        Filepath of the exported notebook. In FILE mode, the parquet file is written
        next to it, named after the notebook path with a trailing ``.ipynb`` removed
        and ``-data.parquet`` appended.

    Returns
    -------
    Tuple[str, List[str]]
        A tuple containing the code for loading the data and a list of imports required for
        the code.

    Raises
    ------
    ValueError
        If `export_data_mode` is not one of the supported export modes.
    """
    if export_data_mode == ExportDataMode.NONE:
        return self._DEFAULT_LOAD_DATA_CODE, []

    if export_data_mode == ExportDataMode.FILE:
        notebook_suffix = ".ipynb"
        base_path = str(notebook_file_path)
        # str.rstrip(".ipynb") strips any trailing run of the characters
        # '.', 'i', 'p', 'y', 'n', 'b' (e.g. "happy.ipynb" -> "ha"), so the
        # suffix is removed explicitly instead.
        if base_path.endswith(notebook_suffix):
            base_path = base_path[: -len(notebook_suffix)]
        parquet_file_name = base_path + "-data.parquet"
        self.df.to_parquet(parquet_file_name)
        # `!r` emits a valid Python string literal even when the path contains
        # backslashes (Windows paths) or quote characters.
        return f"df = pd.read_parquet({parquet_file_name!r})", ["import pandas as pd"]

    # Explicit check rather than `assert`, which is stripped under `python -O`
    # and would silently treat any unknown mode as EMBED.
    if export_data_mode != ExportDataMode.EMBED:
        raise ValueError(f"Unsupported export data mode: {export_data_mode!r}")

    # `buffer` is a bytes object, so it is rendered into the generated code as a
    # bytes literal (b'...') that is decoded again when the notebook runs.
    buffer = base64.b85encode(self.df.to_parquet())
    return (
        code_dedent(
            f"""
            df_parquet = BytesIO(base64.b85decode({buffer}.decode()))
            df = pd.read_parquet(df_parquet)"""
        ),
        ["import base64", "import pandas as pd", "from io import BytesIO"],
    )

def export_notebook(
self,
notebook_filepath: str,
notebook_filepath: Union[str, os.PathLike],
dataset_name: str = "[INSERT DATASET NAME]",
dataset_description: str = "[INSERT DATASET DESCRIPTION]",
export_data_mode: ExportDataMode = ExportDataMode.NONE,
) -> None:
"""Exports the report as an .ipynb file.
Parameters
----------
notebook_filepath : str
notebook_filepath : str or PathLike
Filepath of the exported notebook.
dataset_name : str (default = "[INSERT DATASET NAME]")
Name of dataset to be used in the title of the report.
dataset_description : str (default = "[INSERT DATASET DESCRIPTION]")
Description of dataset to be used below the title of the report.
export_data_mode : ExportDataMode (default = ExportDataMode.NONE)
Mode for exporting the data to the notebook.
If ExportDataMode.NONE, the data is not exported to the notebook.
If ExportDataMode.FILE, the data is exported to a parquet file
and loaded from there.
If ExportDataMode.EMBED, the data is embedded into the notebook
as a base64 string.
"""
load_data_code, load_data_imports = self._export_data(
export_data_mode, notebook_file_path=notebook_filepath
)
# Generate a notebook containing dataset name and description
self._warn_if_empty()
nb = self._generate_notebook(
dataset_name=dataset_name, dataset_description=dataset_description
dataset_name=dataset_name,
dataset_description=dataset_description,
load_data_code=load_data_code,
hide_load_data_code=export_data_mode == ExportDataMode.EMBED,
extra_imports=load_data_imports,
)

# Save notebook to file
Expand All @@ -113,9 +182,10 @@ def export_notebook(

def _generate_notebook(
self,
load_data_code: str,
hide_load_data_code: bool,
dataset_name: str = "[INSERT DATASET NAME]",
dataset_description: str = "[INSERT DATASET DESCRIPTION]",
load_df: str = "df = ...",
extra_imports: Optional[List[str]] = None,
show_load_data: bool = True,
) -> nbf.NotebookNode:
Expand All @@ -127,7 +197,7 @@ def _generate_notebook(
Name of dataset to be used in the title of the report.
dataset_description : str (default = "[INSERT DATASET DESCRIPTION]")
Description of dataset to be used below the title of the report.
load_df : str (default = 'df = ...')
load_data_code : str (default = 'df = ...')
Code string for loading a dataset to variable `df`.
extra_imports : List[str], optional
Any additional imports to be included in imports section
Expand Down Expand Up @@ -166,7 +236,10 @@ def _generate_notebook(
# Add load data cell
if show_load_data:
nb["cells"].append(nbf4.new_markdown_cell("## Load Data\n---"))
nb["cells"].append(nbf4.new_code_cell(load_df))
load_data_cell = nbf4.new_code_cell(load_data_code)
if hide_load_data_code:
load_data_cell["metadata"] = {"jupyter": {"source_hidden": True}}
nb["cells"].append(load_data_cell)

# Generate code for each report section
if self._table_of_contents is not None:
Expand Down Expand Up @@ -313,7 +386,8 @@ def _dev_export_notebook(self, notebook_filepath: str) -> None:
"""
nb = self._generate_notebook(
extra_imports=["import edvart"],
load_df="df = edvart.example_datasets.dataset_titanic()",
load_data_code="df = edvart.example_datasets.dataset_titanic()",
hide_load_data_code=False,
)

# Save notebook to file
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ umap-learn = { version = "^0.5.4", optional = true }
numba = { version = "^0.59", optional = true }
pyarrow = { version = "^14.0.1", optional = true }
isort = "^5.10.1"
strenum = { version = "^0.4.15", python = "<3.11" }

[tool.poetry.extras]
umap = ["umap-learn", "numba"]
Expand Down
36 changes: 35 additions & 1 deletion tests/test_report.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import pathlib
import warnings
from contextlib import redirect_stdout

import nbconvert
import nbformat
import numpy as np
import pandas as pd

from edvart.report import DefaultReport, Report
from edvart.report import DefaultReport, ExportDataMode, Report
from edvart.report_sections.bivariate_analysis import BivariateAnalysis
from edvart.report_sections.section_base import Verbosity
from edvart.report_sections.univariate_analysis import UnivariateAnalysis
Expand Down Expand Up @@ -90,3 +93,34 @@ def test_show():
warnings.simplefilter("ignore", UserWarning)
with redirect_stdout(None):
report.show()


def test_notebook_export(tmp_path: pathlib.Path):
    """Smoke-test notebook export for every data mode, as enum members and as plain strings."""
    modes = (
        ExportDataMode.NONE,
        ExportDataMode.EMBED,
        ExportDataMode.FILE,
        "embed",
        "none",
        "file",
    )
    report = Report(dataframe=_get_test_df())
    report.add_overview()

    for mode in modes:
        target = tmp_path / f"export_{mode}.ipynb"
        report.export_notebook(target, export_data_mode=mode)


def test_exported_notebook_executes(tmp_path: pathlib.Path):
    """Export the report with each data-carrying mode and execute the resulting notebook."""
    report = Report(dataframe=_get_test_df())

    report.add_overview()
    for export_data_mode in (ExportDataMode.EMBED, ExportDataMode.FILE):
        # f-string so every mode gets its own notebook file; without the `f` prefix
        # both modes wrote to the literal name "export_{export_data_mode}.ipynb".
        export_path = tmp_path / f"export_{export_data_mode}.ipynb"
        report.export_notebook(export_path, export_data_mode=export_data_mode)

        notebook = nbformat.read(export_path, as_version=4)
        preprocessor = nbconvert.preprocessors.ExecutePreprocessor(timeout=60)

        # Raises if any cell of the exported notebook fails to execute.
        preprocessor.preprocess(notebook)

0 comments on commit 9e6ec93

Please sign in to comment.