-
Notifications
You must be signed in to change notification settings - Fork 35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Refactor rivulet manifest files to use deltacat #464
Changes from all commits
eff47c5
58d0541
e639434
1a1df31
7830701
6737e0b
beecce0
60df7c3
3166252
f9224e0
6d04f77
872f8fe
2e49359
1e7f9af
868ed5d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -135,7 +135,7 @@ def of( | |
f"'{entry_content_type}'" | ||
) | ||
raise ValueError(msg) | ||
entry_content_encoding = meta["content_encoding"] | ||
entry_content_encoding = meta.get("content_encoding", None) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was raising a ValueError for manifests with no content encoding (like in Rivulet...). We don't populate lots of fields currently for rivulet deltas/manifests so we will have to update code to populate them (see: #476) |
||
if entry_content_encoding != content_encoding: | ||
msg = ( | ||
f"Expected all manifest entries to have content " | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,189 @@ | ||
from __future__ import annotations | ||
|
||
from typing import Protocol, NamedTuple, List | ||
import time | ||
|
||
from deltacat.storage import ( | ||
ManifestMeta, | ||
EntryType, | ||
DeltaLocator, | ||
Delta, | ||
DeltaType, | ||
Transaction, | ||
TransactionType, | ||
TransactionOperation, | ||
TransactionOperationType, | ||
) | ||
from deltacat.storage.model.manifest import Manifest, ManifestEntryList, ManifestEntry | ||
from deltacat.storage.model.transaction import TransactionOperationList | ||
|
||
from deltacat.storage.rivulet import Schema | ||
|
||
StreamPosition = int | ||
"""The stream position for creating a consistent ordering of manifests.""" | ||
TreeLevel = int | ||
"""The level of the manifest in the LSM-tree.""" | ||
|
||
|
||
class DeltaContext(NamedTuple): | ||
"""Minimal amount of manifest context that may need to be circulated independently or alongside individual files""" | ||
|
||
# Schema needed to understand which field group was added when writing manifest | ||
# TODO in the future we should use something like a field group id and keep schema in dataset-level metadata | ||
schema: Schema | ||
stream_position: StreamPosition | ||
level: TreeLevel | ||
|
||
|
||
class RivuletDelta(dict): | ||
""" | ||
Temporary class during merging of deltacat/rivulet metadata formats | ||
|
||
This class currently serves two purposes: | ||
1. Avoid big bang refactor in which consumers of RivuletDelta have to update their code to consume deltacat Delta/Manifest | ||
2. Provide more time to figure out how to represent SST files / schema / etc within deltacat constructs | ||
|
||
""" | ||
|
||
context: DeltaContext | ||
|
||
@staticmethod | ||
def of(delta: Delta) -> RivuletDelta: | ||
riv_delta = RivuletDelta() | ||
riv_delta["dcDelta"] = delta | ||
schema = Schema.from_dict(delta.get("schema")) | ||
riv_delta["DeltaContext"] = DeltaContext( | ||
schema, delta.stream_position, delta.get("level") | ||
) | ||
|
||
return riv_delta | ||
|
||
@property | ||
def dcDelta(self) -> Delta: | ||
return self.get("dcDelta") | ||
|
||
@property | ||
def sst_files(self) -> List[str]: | ||
if "sst_files" not in self.keys(): | ||
self["sst_files"] = [m.uri for m in self.dcDelta.manifest.entries] | ||
return self["sst_files"] | ||
|
||
@sst_files.setter | ||
def sst_files(self, files: List[str]): | ||
self["sst_files"] = files | ||
|
||
@property | ||
def context(self) -> DeltaContext: | ||
return self["DeltaContext"] | ||
|
||
@context.setter | ||
def context(self, mc: DeltaContext): | ||
self["DeltaContext"] = mc | ||
|
||
|
||
class ManifestIO(Protocol): | ||
""" | ||
Minimal interface for reading and writing manifest files | ||
""" | ||
|
||
def write( | ||
self, | ||
sst_files: List[str], | ||
schema: Schema, | ||
level: TreeLevel, | ||
) -> str: | ||
... | ||
|
||
def read(self, file: str) -> RivuletDelta: | ||
... | ||
|
||
|
||
class DeltacatManifestIO(ManifestIO): | ||
""" | ||
Writes manifest data, but by writing to a Deltacat metastore using Deltacat delta/manifest classes | ||
""" | ||
|
||
def __init__(self, root: str): | ||
self.root = root | ||
|
||
def write( | ||
self, | ||
sst_files: List[str], | ||
schema: Schema, | ||
level: TreeLevel, | ||
) -> str: | ||
# Build the Deltacat Manifest entries: | ||
entry_list = ManifestEntryList() | ||
""" | ||
Currently, we use the "data files" manifest entry field for SST files | ||
This is a bit of a hack - we should consider how to better model SST files | ||
(e.g.: add Manifest entry of type "SST") and decide whether we also need to record data files separately | ||
even though they're referenced by SST | ||
Ticket: https://github.com/ray-project/deltacat/issues/469 | ||
""" | ||
for sst_uri in sst_files: | ||
entry_list.append( | ||
ManifestEntry.of( | ||
url=sst_uri, | ||
# TODO have rivulet writer populate these values | ||
# see: https://github.com/ray-project/deltacat/issues/476 | ||
meta=ManifestMeta.of( | ||
record_count=None, # or known | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Aside: What does this comment mean? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it means that e.g. when writing data rivulet writer will know the record count and propagate that to manifests. Not updating code since I think the comment/linked ticket are sufficient |
||
content_length=None, | ||
content_type=None, | ||
content_encoding=None, | ||
entry_type=EntryType.DATA, | ||
), | ||
) | ||
) | ||
dc_manifest = Manifest.of(entries=entry_list) | ||
|
||
# Create delta and transaction which writes manifest to root | ||
# TODO replace this with higher level storage interface for deltacat | ||
|
||
delta_locator = DeltaLocator.at( | ||
namespace=None, | ||
table_name=None, | ||
table_version=None, | ||
stream_id=None, | ||
stream_format=None, | ||
partition_values=None, | ||
partition_id=None, | ||
# Using microsecond precision timestamp as stream position | ||
# TODO consider having storage interface auto assign stream position | ||
stream_position=time.time_ns(), | ||
Comment on lines
+152
to
+154
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nitpick: this appears to be nanosecond precision. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea. Not updating for now since will move automatic stream position generation behind storage/catalog interface |
||
) | ||
|
||
delta = Delta.of( | ||
locator=delta_locator, | ||
delta_type=DeltaType.APPEND, | ||
meta=None, | ||
properties={}, | ||
manifest=dc_manifest, | ||
) | ||
# TODO later support multiple schemas (https://github.com/ray-project/deltacat/issues/468) | ||
delta["schema"] = schema.to_dict() | ||
# TODO consider if level should be added as first class key to delta or | ||
# kept as specific to storage interface | ||
delta["level"] = level | ||
|
||
tx_results = Transaction.of( | ||
txn_type=TransactionType.APPEND, | ||
txn_operations=TransactionOperationList.of( | ||
[ | ||
TransactionOperation.of( | ||
operation_type=TransactionOperationType.CREATE, | ||
dest_metafile=delta, | ||
) | ||
] | ||
), | ||
).commit(self.root) | ||
paths = tx_results[0] | ||
assert ( | ||
len(paths) == 1 | ||
), "expected delta commit transaction to write exactly 1 metafile" | ||
Comment on lines
+182
to
+184
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At this point, the transaction has been committed. What (w|sh)ould be the right way to handle this AssertionError? (Is there a way for the caller to recover from such a scenario?) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is more like a sanity check type exception - we only added one metafile and if the transaction committed more than one than something is seriously wrong We are abstracting the low level Tx handling behind storage interface in near future |
||
return paths[0] | ||
|
||
def read(self, file: str): | ||
delta = Delta.read(file) | ||
return RivuletDelta.of(delta) |
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I just added this to run tests more rapidly. Note that some of the unit tests for compaction take a while so we should consider moving them to benchmarks