Skip to content

Commit 5db44d7

Browse files
marcenacp authored and The TensorFlow Datasets Authors committed
Read the length of the datasource from the FileInstructions to limit I/O.
PiperOrigin-RevId: 737687954
1 parent 27547b2 commit 5db44d7

File tree

3 files changed

+13
-13
lines changed

3 files changed

+13
-13
lines changed

Diff for: tensorflow_datasets/core/data_sources/array_record.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,7 @@ class ArrayRecordDataSource(base.BaseDataSource):
5656
length: int = dataclasses.field(init=False)
5757

5858
def __post_init__(self):
59-
file_instructions = base.file_instructions(self.dataset_info, self.split)
59+
file_instructions = self.split_info.file_instructions
6060
self.data_source = array_record_data_source.ArrayRecordDataSource(
6161
file_instructions
6262
)

Diff for: tensorflow_datasets/core/data_sources/base.py

+11-11
Original file line number | Diff line number | Diff line change
@@ -45,16 +45,6 @@ def __getitems__(self, keys: Iterable[int]) -> T:
4545
"""Returns the value for the given `keys`."""
4646

4747

48-
def file_instructions(
49-
dataset_info: dataset_info_lib.DatasetInfo,
50-
split: splits_lib.Split | None = None,
51-
) -> list[shard_utils.FileInstruction]:
52-
"""Retrieves the file instructions from the DatasetInfo."""
53-
split_infos = dataset_info.splits.values()
54-
split_dict = splits_lib.SplitDict(split_infos=split_infos)
55-
return split_dict[split].file_instructions
56-
57-
5848
@dataclasses.dataclass
5949
class BaseDataSource(MappingView, Sequence):
6050
"""Base DataSource to override all dunder methods with the deserialization.
@@ -94,6 +84,16 @@ def _deserialize(self, record: Any) -> Any:
9484
return features.deserialize_example_np(record, decoders=self.decoders) # pylint: disable=attribute-error
9585
raise ValueError('No features set, cannot decode example!')
9686

87+
@property
88+
def split_info(self) -> splits_lib.SplitInfo | splits_lib.SubSplitInfo:
89+
"""Returns the SplitInfo for the split."""
90+
splits = self.dataset_info.splits
91+
if self.split not in splits:
92+
raise ValueError(
93+
f'Split {self.split} not found in dataset {self.dataset_info.name}!'
94+
)
95+
return splits[self.split]
96+
9797
def __getitem__(self, key: SupportsIndex) -> Any:
9898
record = self.data_source[key.__index__()]
9999
return self._deserialize(record)
@@ -133,7 +133,7 @@ def __repr__(self) -> str:
133133
)
134134

135135
def __len__(self) -> int:
136-
return self.data_source.__len__()
136+
return sum(fi.examples_in_shard for fi in self.split_info.file_instructions)
137137

138138
def __iter__(self):
139139
for i in range(self.__len__()):

Diff for: tensorflow_datasets/core/data_sources/parquet.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,7 @@ class ParquetDataSource(base.BaseDataSource):
5757
"""ParquetDataSource to read from a ParquetDataset."""
5858

5959
def __post_init__(self):
60-
file_instructions = base.file_instructions(self.dataset_info, self.split)
60+
file_instructions = self.split_info.file_instructions
6161
filenames = [
6262
file_instruction.filename for file_instruction in file_instructions
6363
]

0 commit comments

Comments (0)