airbytehq · aaronsteers · Aug 16, 2024 · Aug 20, 2024 · Aug 20, 2024 · Aug 20, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
+venv
+.venv
+.venv-*
 .gradle
 .idea
 *.iml

diff --git a/airbyte-cdk/python/.gitignore b/airbyte-cdk/python/.gitignore
@@ -1,3 +1,4 @@
+test_response.txt
 .coverage
 
 # TODO: these are tmp files generated by unit tests. They should go to the /tmp directory.

diff --git a/airbyte-cdk/python/airbyte_cdk/entrypoint.py b/airbyte-cdk/python/airbyte_cdk/entrypoint.py
@@ -6,16 +6,19 @@
 import importlib
 import ipaddress
 import logging
+import os
 import os.path
 import socket
 import sys
 import tempfile
 from collections import defaultdict
 from functools import wraps
-from typing import Any, DefaultDict, Iterable, List, Mapping, Optional
+from typing import Any, DefaultDict, Iterable, List, Mapping, Optional, TextIO
 from urllib.parse import urlparse
 
 import requests
+from requests import PreparedRequest, Response, Session
+
 from airbyte_cdk.connector import TConfig
 from airbyte_cdk.exception_handler import init_uncaught_exception_handler
 from airbyte_cdk.logger import init_logger
@@ -235,14 +238,33 @@ def _emit_queued_messages(self, source: Source) -> Iterable[AirbyteMessage]:
         return
 
 
-def launch(source: Source, args: List[str]) -> None:
+def launch(
+    source: Source,
+    args: List[str],
+    output_stream: Optional[TextIO] = None,
+) -> None:
+    """Launch the source connector with the given arguments.
+
+    Messages are printed to `output_stream`, which defaults to `sys.stdout` and may be any
+    writable text stream. Passing the `os.devnull` path instead suppresses output: the
+    messages are still consumed, but nothing is printed.
+    """
+    output_stream = output_stream or sys.stdout
     source_entrypoint = AirbyteEntrypoint(source)
     parsed_args = source_entrypoint.parse_args(args)
+    record_iterator = source_entrypoint.run(parsed_args)
+
+    if output_stream == os.devnull:
+        # `os.devnull` is a path string, so compare by value; drain without printing.
+        for _ in record_iterator:
+            pass
+        return
+
     with PrintBuffer():
-        for message in source_entrypoint.run(parsed_args):
+        for message in record_iterator:
             # simply printing is creating issues for concurrent CDK as Python uses different two instructions to print: one for the message and
             # the other for the break line. Adding `\n` to the message ensure that both are printed at the same time
-            print(f"{message}\n", end="", flush=True)
+            print(f"{message}\n", end="", flush=True, file=output_stream)
 
 
 def _init_internal_request_filter() -> None:

diff --git a/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/concurrent_source.py b/airbyte-cdk/python/airbyte_cdk/sources/concurrent_source/concurrent_source.py
@@ -133,7 +133,10 @@ def _handle_item(
         concurrent_stream_processor: ConcurrentReadProcessor,
     ) -> Iterable[AirbyteMessage]:
         # handle queue item and call the appropriate handler depending on the type of the queue item
-        if isinstance(queue_item, StreamThreadException):
+        if isinstance(queue_item, AirbyteMessage):
+            # Most likely a record message, pre-wrapped for perf reasons. Just yield it.
+            yield queue_item
+        elif isinstance(queue_item, StreamThreadException):
             yield from concurrent_stream_processor.on_exception(queue_item)
         elif isinstance(queue_item, PartitionGenerationCompletedSentinel):
             yield from concurrent_stream_processor.on_partition_generation_completed(queue_item)

diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/file_based_stream_config.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/config/file_based_stream_config.py
@@ -1,10 +1,13 @@
 #
 # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
 #
+from __future__ import annotations
 
 from enum import Enum
 from typing import Any, List, Mapping, Optional, Union
 
+from pydantic.v1 import BaseModel, Field, validator
+
 from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
 from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
 from airbyte_cdk.sources.file_based.config.excel_format import ExcelFormat
@@ -13,7 +16,6 @@
 from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
 from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError
 from airbyte_cdk.sources.file_based.schema_helpers import type_mapping_to_jsonschema
-from pydantic.v1 import BaseModel, Field, validator
 
 PrimaryKeyType = Optional[Union[str, List[str]]]
 
@@ -24,6 +26,34 @@ class ValidationPolicy(Enum):
     wait_for_discover = "Wait for Discover"
 
 
+# Mode that `BulkMode.AUTO` resolves to. TODO: Revisit this default (it is already DISABLED).
+DEFAULT_BULK_MODE = "DISABLED"
+
+
+class ResolvedBulkMode(Enum):
+    """Concrete bulk-processing state, i.e. what a `BulkMode` value resolves to."""
+
+    DISABLED = "DISABLED"
+    ENABLED = "ENABLED"
+
+
+class BulkMode(Enum):
+    """Bulk-processing setting for file-based streams.
+
+    `AUTO` defers to `DEFAULT_BULK_MODE`; `resolve()` returns the concrete `ResolvedBulkMode`.
+    """
+
+    DISABLED = "DISABLED"
+    ENABLED = "ENABLED"
+    AUTO = "AUTO"
+
+    def resolve(bulk_mode: BulkMode) -> ResolvedBulkMode:
+        # `bulk_mode` is the enum member itself: it takes the place of the usual `self`.
+        if bulk_mode == BulkMode.DISABLED:
+            return ResolvedBulkMode.DISABLED
+        return ResolvedBulkMode(DEFAULT_BULK_MODE) if bulk_mode == BulkMode.AUTO else ResolvedBulkMode.ENABLED
+
+
 class FileBasedStreamConfig(BaseModel):
     name: str = Field(title="Name", description="The name of the stream.")
     globs: Optional[List[str]] = Field(
@@ -71,6 +101,11 @@ class FileBasedStreamConfig(BaseModel):
         default=None,
         gt=0,
     )
+    bulk_mode: BulkMode = Field(
+        title="Bulk Processing Optimizations",
+        description="The bulk processing mode for this stream.",
+        default=BulkMode.AUTO,
+    )
 
     @validator("input_schema", pre=True)
     def validate_input_schema(cls, v: Optional[str]) -> Optional[str]:

diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_based_source.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_based_source.py
@@ -187,43 +187,61 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]:
                 )
                 self._validate_input_schema(stream_config)
 
-                sync_mode = self._get_sync_mode_from_catalog(stream_config.name)
+                sync_mode: SyncMode | None = self._get_sync_mode_from_catalog(stream_config.name)
+                # Note: sync_mode may be `None` in `check` and `discover` modes.
 
-                if sync_mode == SyncMode.full_refresh and hasattr(self, "_concurrency_level") and self._concurrency_level is not None:
-                    cursor = FileBasedFinalStateCursor(
-                        stream_config=stream_config, stream_namespace=None, message_repository=self.message_repository
-                    )
-                    stream = FileBasedStreamFacade.create_from_stream(
-                        self._make_default_stream(stream_config, cursor), self, self.logger, stream_state, cursor
+                # Concurrency level is set but the cursor class is not concurrent. This is not allowed.
+                if (
+                    hasattr(self, "_concurrency_level") and self._concurrency_level is not None
+                    and not issubclass(self.cursor_cls, AbstractConcurrentFileBasedCursor)
+                ):
+                    self.logger.warning(
+                        "An internal error occurred. The cursor class must be a concurrent "
+                        "cursor if concurrency level is set. "
+                        "Falling back to non-concurrent execution, which may be slower."
                     )
+                    self._concurrency_level = None
 
-                elif (
-                    sync_mode == SyncMode.incremental
-                    and issubclass(self.cursor_cls, AbstractConcurrentFileBasedCursor)
-                    and hasattr(self, "_concurrency_level")
-                    and self._concurrency_level is not None
-                ):
-                    assert (
-                        state_manager is not None
-                    ), "No ConnectorStateManager was created, but it is required for incremental syncs. This is unexpected. Please contact Support."
-
-                    cursor = self.cursor_cls(
-                        stream_config,
-                        stream_config.name,
-                        None,
-                        stream_state,
-                        self.message_repository,
-                        state_manager,
-                        CursorField(DefaultFileBasedStream.ab_last_mod_col),
+                if not hasattr(self, "_concurrency_level") or self._concurrency_level is None:
+                    # Concurrency not supported for this stream.
+                    cursor = self.cursor_cls(stream_config)
+                    stream = self._make_default_stream(stream_config, cursor)
+                    streams.append(stream)
+                    continue
+
+                # Else, we have a concurrency level set and a valid concurrent cursor class.
+
+                if sync_mode == SyncMode.full_refresh or sync_mode is None:
+                    cursor = FileBasedFinalStateCursor(
+                        stream_config=stream_config, stream_namespace=None, message_repository=self.message_repository
                     )
                     stream = FileBasedStreamFacade.create_from_stream(
                         self._make_default_stream(stream_config, cursor), self, self.logger, stream_state, cursor
                     )
-                else:
-                    cursor = self.cursor_cls(stream_config)
-                    stream = self._make_default_stream(stream_config, cursor)
-
+                    streams.append(stream)
+                    continue
+
+                # Else, incremental sync with concurrent cursor:
+
+                assert (
+                    state_manager is not None
+                ), "No ConnectorStateManager was created, but it is required for incremental syncs. This is unexpected. Please contact Support."
+
+                cursor = self.cursor_cls(
+                    stream_config,
+                    stream_config.name,
+                    None,
+                    stream_state,
+                    self.message_repository,
+                    state_manager,
+                    CursorField(DefaultFileBasedStream.ab_last_mod_col),
+                )
+                stream = FileBasedStreamFacade.create_from_stream(
+                    self._make_default_stream(stream_config, cursor), self, self.logger, stream_state, cursor
+                )
                 streams.append(stream)
+                continue
+
             return streams
 
         except ValidationError as exc:

diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_based_stream_reader.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_based_stream_reader.py
@@ -105,3 +105,50 @@ def get_prefixes_from_globs(globs: List[str]) -> Set[str]:
         """
         prefixes = {glob.split("*")[0] for glob in globs}
         return set(filter(lambda x: bool(x), prefixes))
+
+    def is_polars_supported(self, file: RemoteFile | None = None) -> bool:
+        """Return whether Polars-based reading is supported by this stream reader.
+
+        Support requires non-empty `polars_storage_options` and, when `file` is given, a
+        non-empty `get_fully_qualified_uri(file.uri)`. A `NotImplementedError` raised by
+        either lookup counts as "not supported", so a reader that leaves
+        `polars_storage_options` unimplemented always gets `False`.
+
+        Args:
+            file: Optional file whose URI must also be resolvable.
+        """
+        try:
+            # The storage options are only consulted once the URI check (if any) has passed.
+            uri_resolves = not file or bool(self.get_fully_qualified_uri(file.uri))
+            if not uri_resolves:
+                return False
+            return bool(self.polars_storage_options)
+        except NotImplementedError:
+            # Only this exception means "unsupported"; anything else propagates.
+            return False
+
+    def get_fully_qualified_uri(
+        self,
+        file_uri: str,
+    ) -> str:
+        """Return `file_uri` as a fully qualified URI.
+
+        This base implementation only passes through URIs that already contain a scheme
+        separator (`://`) and raises `NotImplementedError` for anything else. NOTE(review):
+        subclasses are presumably expected to override it, e.g. to prepend an S3 bucket.
+        """
+        if "://" not in file_uri:
+            class_name = type(self).__name__
+            raise NotImplementedError("The `get_fully_qualified_uri()` method is not implemented by class: " + class_name)
+        return file_uri
+
+    @property
+    def polars_storage_options(self) -> dict[str, str]:
+        """Storage options for this stream reader, in the form passed to Polars scans.
+
+        Raises:
+            NotImplementedError: Unless overridden; this base implementation always raises.
+        """
+        owner = type(self).__name__
+        message = "The `polars_storage_options()` method is not implemented by class: " + owner
+        raise NotImplementedError(message)
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/file_type_parser.py
@@ -6,6 +6,8 @@
 from abc import ABC, abstractmethod
 from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
 
+import polars as pl
+
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -81,3 +83,16 @@ def file_read_mode(self) -> FileReadMode:
         The mode in which the file should be opened for reading.
         """
         ...
+
+    def parse_records_as_dataframes(
+        self,
+        config: FileBasedStreamConfig,
+        file: RemoteFile,
+        stream_reader: AbstractFileBasedStreamReader,
+        logger: logging.Logger,
+        discovered_schema: Optional[Mapping[str, SchemaType]],
+    ) -> Iterable[pl.DataFrame | pl.LazyFrame]:
+        """
+        Parse records as Polars DataFrames/LazyFrames; this base version raises NotImplementedError.
+        """
+        raise NotImplementedError
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/jsonl_parser.py
@@ -6,6 +6,8 @@
 import logging
 from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
 
+import polars as pl
+
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
@@ -87,6 +89,7 @@ def _parse_jsonl_entries(
         logger: logging.Logger,
         read_limit: bool = False,
     ) -> Iterable[Dict[str, Any]]:
+        """Parse records and emit as iterable of dictionaries."""
         with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
             read_bytes = 0
 
@@ -128,3 +131,47 @@ def _instantiate_accumulator(line: Union[bytes, str]) -> Union[bytes, str]:
             return bytes("", json.detect_encoding(line))
         elif isinstance(line, str):
             return ""
+
+    def parse_records_as_dataframes(
+        self,
+        config: FileBasedStreamConfig,
+        file: RemoteFile,
+        stream_reader: AbstractFileBasedStreamReader,
+        logger: logging.Logger,
+        discovered_schema: Optional[Mapping[str, SchemaType]],
+    ) -> Iterable[pl.DataFrame | pl.LazyFrame]:
+        """Parse records and emit as iterable of data frames.
+
+        The file is scanned lazily with Polars and yielded as consecutive `DataFrame` batches
+        of up to 50,000 rows each; iteration ends at the first empty batch.
+        """
+        # The incoming URI is actually a relative path. We need the absolute ref, for
+        # instance: including the 's3://' protocol, bucket name, etc.
+        fully_qualified_uri = stream_reader.get_fully_qualified_uri(file.uri.split("#")[0])
+        storage_options = stream_reader.polars_storage_options
+        logger.info("Using bulk processing mode to read JSONL file: %s", fully_qualified_uri)
+
+        lazyframe: pl.LazyFrame = pl.scan_ndjson(
+            fully_qualified_uri,
+            storage_options=storage_options,
+            row_index_name="_ab_record_index",
+            infer_schema_length=10_000,
+        ).with_columns(
+            pl.lit(file.uri).alias("_ab_source_file_url"),
+            pl.lit(file.last_modified).alias("_ab_source_file_last_modified"),
+        )
+
+        def slice_generator(
+            lazyframe: pl.LazyFrame,
+            batch_size: int = 50_000,
+        ) -> Iterable[pl.DataFrame]:
+            offset = 0
+            while True:
+                batch = lazyframe.slice(offset=offset, length=batch_size).collect(streaming=True)
+                if batch.height == 0:
+                    break
+                yield batch
+                # Advance the window; without this the first batch is re-read forever.
+                offset += batch.height
+
+        yield from slice_generator(lazyframe)