diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index b083748f..c6d394f0 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -27,14 +27,26 @@ jobs: poetry config virtualenvs.create false --local poetry install - name: test_general + env: + LOOP_HOST: "preview.learning-loop.ai" + LOOP_USERNAME: "admin" + LOOP_PASSWORD: ${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} run: | - LOOP_HOST=preview.learning-loop.ai LOOP_USERNAME=admin LOOP_PASSWORD=${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} pytest learning_loop_node/tests/general -v + pytest "learning_loop_node/tests/general" -v - name: test_detector + env: + LOOP_HOST: "preview.learning-loop.ai" + LOOP_USERNAME: "admin" + LOOP_PASSWORD: ${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} run: | - LOOP_HOST=preview.learning-loop.ai LOOP_USERNAME=admin LOOP_PASSWORD=${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} pytest learning_loop_node/tests/detector -v + pytest learning_loop_node/tests/detector -v - name: test_mock_detector + env: + LOOP_HOST: "preview.learning-loop.ai" + LOOP_USERNAME: "admin" + LOOP_PASSWORD: ${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} run: | - LOOP_HOST=preview.learning-loop.ai LOOP_USERNAME=admin LOOP_PASSWORD=${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} pytest mock_detector -v + pytest mock_detector -v pytest_3_11: needs: @@ -62,14 +74,26 @@ jobs: poetry config virtualenvs.create false --local poetry install - name: test_annotator + env: + LOOP_HOST: "preview.learning-loop.ai" + LOOP_USERNAME: "admin" + LOOP_PASSWORD: ${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} run: | - LOOP_HOST=preview.learning-loop.ai LOOP_USERNAME=admin LOOP_PASSWORD=${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} pytest learning_loop_node/tests/annotator -v + pytest learning_loop_node/tests/annotator -v - name: test_trainer + env: + LOOP_HOST: "preview.learning-loop.ai" + LOOP_USERNAME: "admin" + LOOP_PASSWORD: ${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} run: | - LOOP_HOST=preview.learning-loop.ai LOOP_USERNAME=admin LOOP_PASSWORD=${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} pytest learning_loop_node/tests/trainer -v + pytest learning_loop_node/tests/trainer -v - name: test_mock_trainer + env: + LOOP_HOST: "preview.learning-loop.ai" + LOOP_USERNAME: "admin" + LOOP_PASSWORD: ${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} run: | - LOOP_HOST=preview.learning-loop.ai LOOP_USERNAME=admin LOOP_PASSWORD=${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} pytest mock_trainer -v + pytest mock_trainer -v slack: needs: diff --git a/learning_loop_node/data_classes/__init__.py b/learning_loop_node/data_classes/__init__.py index b3061068..e280ee9b 100644 --- a/learning_loop_node/data_classes/__init__.py +++ b/learning_loop_node/data_classes/__init__.py @@ -5,8 +5,8 @@ ModelInformation, NodeState, NodeStatus) from .image_metadata import ImageMetadata from .socket_response import SocketResponse -from .training import (Errors, Hyperparameter, Model, PretrainedModel, TrainerState, Training, TrainingData, - TrainingError, TrainingOut, TrainingStateData, TrainingStatus) +from .training import (Errors, PretrainedModel, TrainerState, Training, TrainingError, TrainingOut, TrainingStateData, + TrainingStatus) __all__ = [ 'AnnotationData', 'AnnotationEventType', 'SegmentationAnnotation', 'ToolOutput', 'UserInput', @@ -15,6 +15,6 @@ 'AnnotationNodeStatus', 'Category', 'CategoryType', 'Context', 'DetectionStatus', 'ErrorConfiguration', 'ModelInformation', 'NodeState', 'NodeStatus', 'SocketResponse', - 'Errors', 'Hyperparameter', 'Model', 'PretrainedModel', 'TrainerState', 'Training', 'TrainingData', + 'Errors', 'PretrainedModel', 'TrainerState', 'Training', 'TrainingError', 'TrainingOut', 'TrainingStateData', 'TrainingStatus', ] diff --git a/learning_loop_node/data_classes/general.py b/learning_loop_node/data_classes/general.py index 087daada..22e38bd8 100644 --- a/learning_loop_node/data_classes/general.py +++ b/learning_loop_node/data_classes/general.py @@ -75,7 +75,7 @@ def load_from_disk(model_root_path: str) -> Optional['ModelInformation']: """ model_info_file_path = f'{model_root_path}/model.json' if not os.path.exists(model_info_file_path): - logging.warning(f"could not find model information file '{model_info_file_path}'") + logging.warning('could not find model information file %s', model_info_file_path) return None with open(model_info_file_path, 'r') as f: try: diff --git a/learning_loop_node/data_classes/training.py b/learning_loop_node/data_classes/training.py index d530ae7a..42f0d74b 100644 --- a/learning_loop_node/data_classes/training.py +++ b/learning_loop_node/data_classes/training.py @@ -4,46 +4,16 @@ from dataclasses import dataclass, field from enum import Enum from pathlib import Path -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional +from uuid import uuid4 +from ..helpers.misc import create_image_folder, create_training_folder # pylint: disable=no-name-in-module from .general import Category, Context KWONLY_SLOTS = {'kw_only': True, 'slots': True} if sys.version_info >= (3, 10) else {} -@dataclass(**KWONLY_SLOTS) -class Hyperparameter(): - resolution: int - flip_rl: bool - flip_ud: bool - - @staticmethod - def from_data(data: Dict): - return Hyperparameter( - resolution=data['resolution'], - flip_rl=data.get('flip_rl', False), - flip_ud=data.get('flip_ud', False) - ) - - -@dataclass(**KWONLY_SLOTS) -class TrainingData(): - image_data: List[Dict] = field(default_factory=list) - skipped_image_count: Optional[int] = 0 - categories: List[Category] = field(default_factory=list) - hyperparameter: Optional[Hyperparameter] = None - - def image_ids(self): - return [image['id'] for image in self.image_data] - - def train_image_count(self): - return len([image for image in self.image_data if image['set'] == 'train']) - - def test_image_count(self): - return len([image for image in self.image_data if image['set'] == 'test']) - - @dataclass(**KWONLY_SLOTS) class PretrainedModel(): name: str @@ -75,26 +45,21 @@ class TrainerState(str, Enum): class TrainingStatus(): id: str # NOTE this must not be changed, but tests wont detect a change -> update tests! name: str + state: Optional[str] - errors: Optional[Dict] uptime: Optional[float] + errors: Optional[Dict[str, Any]] progress: Optional[float] - train_image_count: Optional[int] = None - test_image_count: Optional[int] = None - skipped_image_count: Optional[int] = None pretrained_models: List[PretrainedModel] = field(default_factory=list) - hyperparameters: Optional[Dict] = None architecture: Optional[str] = None context: Optional[Context] = None def short_str(self) -> str: prgr = f'{self.progress * 100:.0f}%' if self.progress else '' - trtesk = f'{self.train_image_count}/{self.test_image_count}/{self.skipped_image_count}' if self.train_image_count else 'n.a.' cntxt = f'{self.context.organization}/{self.context.project}' if self.context else '' - hyps = f'({self.hyperparameters})' if self.hyperparameters else '' arch = f'.{self.architecture} - ' if self.architecture else '' - return f'[{str(self.state).rsplit(".", maxsplit=1)[-1]} {prgr}. {self.name}({self.id}). Tr/Ts/Tsk: {trtesk} {cntxt}{arch}{hyps}]' + return f'[{str(self.state).rsplit(".", maxsplit=1)[-1]} {prgr}. {self.name}({self.id}). {cntxt}{arch}]' @dataclass(**KWONLY_SLOTS) @@ -105,53 +70,83 @@ class Training(): project_folder: str # f'{GLOBALS.data_folder}/{context.organization}/{context.project}' images_folder: str # f'{project_folder}/images' training_folder: str # f'{project_folder}/trainings/{trainings_id}' + + categories: List[Category] + hyperparameters: dict + + training_number: int + training_state: str + model_variant: str # from `provided_pretrained_models->name` + start_time: float = field(default_factory=time.time) - # model uuid to download (to continue training) | is not a uuid when training from scratch (blank or pt-name from provided_pretrained_models->name) - base_model_uuid_or_name: Optional[str] = None + base_model_uuid: Optional[str] = None # model uuid to continue training (is loaded from loop) - data: Optional[TrainingData] = None - training_number: Optional[int] = None - training_state: Optional[str] = None - model_uuid_for_detecting: Optional[str] = None - hyperparameters: Optional[Dict] = None + # NOTE: these are set later after the model has been uploaded + image_data: Optional[List[dict]] = None + skipped_image_count: Optional[int] = None + model_uuid_for_detecting: Optional[str] = None # Model uuid to load from the loop after training and upload @property def training_folder_path(self) -> Path: return Path(self.training_folder) - def set_values_from_data(self, data: Dict) -> None: - self.data = TrainingData(categories=Category.from_list(data['categories'])) - self.data.hyperparameter = Hyperparameter.from_data(data=data) - self.training_number = data['training_number'] - self.base_model_uuid_or_name = data['id'] - self.training_state = TrainerState.Initialized + @classmethod + def generate_training(cls, project_folder: str, context: Context, data: Dict[str, Any]) -> 'Training': + if 'hyperparameters' not in data or not isinstance(data['hyperparameters'], dict): + raise ValueError('hyperparameters missing or not a dict') + if 'categories' not in data or not isinstance(data['categories'], list): + raise ValueError('categories missing or not a list') + if 'training_number' not in data or not isinstance(data['training_number'], int): + raise ValueError('training_number missing or not an int') + if 'model_variant' not in data or not isinstance(data['model_variant'], str): + raise ValueError('model_variant missing or not a str') + + training_uuid = str(uuid4()) + + return Training( + id=training_uuid, + context=context, + project_folder=project_folder, + images_folder=create_image_folder(project_folder), + training_folder=create_training_folder(project_folder, training_uuid), + categories=Category.from_list(data['categories']), + hyperparameters=data['hyperparameters'], + training_number=data['training_number'], + base_model_uuid=data.get('base_model_uuid', None), + model_variant=data['model_variant'], + training_state=TrainerState.Initialized.value + ) + + def image_ids(self) -> List[str]: + assert self.image_data is not None, 'Image data not set' + return [image['id'] for image in self.image_data] + + def train_image_count(self) -> int: + assert self.image_data is not None, 'Image data not set' + return len([image for image in self.image_data if image['set'] == 'train']) + + def test_image_count(self) -> int: + assert self.image_data is not None, 'Image data not set' + return len([image for image in self.image_data if image['set'] == 'test']) @dataclass(**KWONLY_SLOTS) class TrainingOut(): + trainer_id: str + trainer_name: Optional[str] = None confusion_matrix: Optional[Dict] = None # This is actually just class-wise metrics train_image_count: Optional[int] = None test_image_count: Optional[int] = None - trainer_id: Optional[str] = None - hyperparameters: Optional[Dict] = None + hyperparameters: Optional[Dict[str, Any]] = None + best_epoch: Optional[int] = None @dataclass(**KWONLY_SLOTS) class TrainingStateData(): confusion_matrix: Dict = field(default_factory=dict) meta_information: Dict = field(default_factory=dict) - - -@dataclass(**KWONLY_SLOTS) -class Model(): - uuid: str - confusion_matrix: Optional[Dict] = None - parent_id: Optional[str] = None - train_image_count: Optional[int] = None - test_image_count: Optional[int] = None - trainer_id: Optional[str] = None - hyperparameters: Optional[str] = None + epoch: Optional[int] = None class Errors(): diff --git a/learning_loop_node/data_exchanger.py b/learning_loop_node/data_exchanger.py index ad528df3..11e4c59e 100644 --- a/learning_loop_node/data_exchanger.py +++ b/learning_loop_node/data_exchanger.py @@ -62,7 +62,7 @@ def context(self) -> Context: async def fetch_image_uuids(self, query_params: Optional[str] = '') -> List[str]: """Fetch image uuids from the learning loop data endpoint.""" - logging.info(f'Fetching image uuids for {self.context.organization}/{self.context.project}..') + logging.info('Fetching image uuids for %s/%s..', self.context.organization, self.context.project) response = await self.loop_communicator.get(f'/{self.context.organization}/projects/{self.context.project}/data?{query_params}') assert response.status_code == 200, response @@ -70,7 +70,7 @@ async def fetch_image_uuids(self, query_params: Optional[str] = '') -> List[str] async def download_images_data(self, image_uuids: List[str], chunk_size: int = 100) -> List[Dict]: """Download image annotations, tags, set and other information for the given image uuids.""" - logging.info(f'Fetching annotations, tags, sets, etc. for {len(image_uuids)} images..') + logging.info('Fetching annotations, tags, sets, etc. for %s images..', len(image_uuids)) num_image_ids = len(image_uuids) if num_image_ids == 0: @@ -84,7 +84,7 @@ async def download_images_data(self, image_uuids: List[str], chunk_size: int = 1 chunk_ids = image_uuids[i:i+chunk_size] response = await self.loop_communicator.get(f'/{self.context.organization}/projects/{self.context.project}/images?ids={",".join(chunk_ids)}') if response.status_code != 200: - logging.error(f'Error {response.status_code} during downloading image data. Continue with next batch..') + logging.error('Error %s during downloading image data. Continue with next batch..', response.status_code) continue images_data += response.json()['images'] @@ -92,7 +92,7 @@ async def download_images_data(self, image_uuids: List[str], chunk_size: int = 1 async def download_images(self, image_uuids: List[str], image_folder: str, chunk_size: int = 10) -> None: """Downloads images (actual image data). Will skip existing images""" - logging.info(f'Downloading {len(image_uuids)} images (actual image data).. skipping existing images.') + logging.info('Downloading %s images (actual image data).. skipping existing images.', len(image_uuids)) if not image_uuids: return @@ -106,7 +106,7 @@ async def download_images(self, image_uuids: List[str], image_folder: str, chunk self.progress = 1.0 return - logging.info(f'Downloading {num_new_image_ids} new images to {image_folder}..') + logging.info('Downloading %s new images to %s..', num_new_image_ids, image_folder) os.makedirs(image_folder, exist_ok=True) progress_factor = 0.5 / num_new_image_ids # second 50% of progress is for downloading images @@ -128,7 +128,7 @@ async def _download_one_image(self, path: str, image_id: str, image_folder: str) await asyncio.sleep(1) response = await self.loop_communicator.get(path) if response.status_code != HTTPStatus.OK: - logging.error(f'bad status code {response.status_code} for {path}. Details: {response.text}') + logging.error('bad status code %s for %s. Details: %s', response.status_code, path, response.text) return filename = f'{image_folder}/{image_id}.jpg' async with aiofiles.open(filename, 'wb') as f: @@ -171,7 +171,7 @@ async def download_model(self, target_folder: str, context: Context, model_uuid: created_files.append(new_file) shutil.rmtree(tmp_path, ignore_errors=True) - logging.info(f'Downloaded model {model_uuid}({model_format}) to {target_folder}.') + logging.info('Downloaded model %s(%s) to %s.', model_uuid, model_format, target_folder) return created_files async def upload_model_get_uuid(self, context: Context, files: List[str], training_number: Optional[int], mformat: str) -> str: @@ -182,10 +182,12 @@ async def upload_model_get_uuid(self, context: Context, files: List[str], traini """ response = await self.loop_communicator.put(f'/{context.organization}/projects/{context.project}/trainings/{training_number}/models/latest/{mformat}/file', files=files) if response.status_code != 200: - logging.error(f'Could not upload model for training {training_number}, format {mformat}: {response.text}') + logging.error('Could not upload model for training %s, format %s: %s', + training_number, mformat, response.text) raise CriticalError( f'Could not upload model for training {training_number}, format {mformat}: {response.text}') uploaded_model = response.json() - logging.info(f'Uploaded model for training {training_number}, format {mformat}. Response is: {uploaded_model}') + logging.info('Uploaded model for training %s, format %s. Response is: %s', + training_number, mformat, uploaded_model) return uploaded_model['id'] diff --git a/learning_loop_node/helpers/environment_reader.py b/learning_loop_node/helpers/environment_reader.py index 603e127b..4833e1b2 100644 --- a/learning_loop_node/helpers/environment_reader.py +++ b/learning_loop_node/helpers/environment_reader.py @@ -11,14 +11,14 @@ def read_from_env(possible_names: List[str], ignore_errors: bool = True) -> Opti # Possible error: no values are set if not values: if ignore_errors: - logging.warning(f'no environment variable set for {possible_names}') + logging.warning('no environment variable set for %s', possible_names) return None raise ValueError(f'no environment variable set for {possible_names}') # Possible error: multiple values are not None and not equal if len(values) > 1 and len(set(values)) > 1: if ignore_errors: - logging.warning(f'different environment variables set for {possible_names}: {values}') + logging.warning('different environment variables set for %s: %s', possible_names, values) return None raise ValueError(f'different environment variables set for {possible_names}: {values}') diff --git a/learning_loop_node/helpers/log_conf.py b/learning_loop_node/helpers/log_conf.py index 3f9799bb..2bd2fa72 100644 --- a/learning_loop_node/helpers/log_conf.py +++ b/learning_loop_node/helpers/log_conf.py @@ -2,7 +2,7 @@ LOGGING_CONF = { 'version': 1, - 'disable_existing_loggers': True, # to make sure this config is used + 'disable_existing_loggers': False, # to make sure this config is used 'formatters': { 'default': { 'format': '%(asctime)s,%(msecs)01d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s', @@ -34,3 +34,6 @@ def init(): logging.config.dictConfig(LOGGING_CONF) + + +init() diff --git a/learning_loop_node/helpers/misc.py b/learning_loop_node/helpers/misc.py index aea20e60..c0fac873 100644 --- a/learning_loop_node/helpers/misc.py +++ b/learning_loop_node/helpers/misc.py @@ -14,7 +14,8 @@ import pynvml -from ..data_classes import Context, SocketResponse, Training +from ..data_classes.general import Context +from ..data_classes.socket_response import SocketResponse from ..globals import GLOBALS T = TypeVar('T') @@ -81,11 +82,11 @@ async def delete_corrupt_images(image_folder: str, check_jpeg: bool = False) -> n_deleted = 0 for image in glob(f'{image_folder}/*.jpg'): if not await is_valid_image(image, check_jpeg): - logging.debug(f' deleting image {image}') + logging.debug(' deleting image %s', image) os.remove(image) n_deleted += 1 - logging.info(f'deleted {n_deleted} images') + logging.info('deleted %s images', n_deleted) def create_resource_paths(organization_name: str, project_name: str, image_ids: List[str]) -> Tuple[List[str], List[str]]: @@ -144,7 +145,7 @@ async def wrapper_ensure_socket_response(*args, **kwargs): raise Exception( f"Return type for sio must be str, bool, SocketResponse or None', but was {type(value)}'") except Exception as e: - logging.exception(f'An error occured for {args[0]}') + logging.exception('An error occured for %s', args[0]) return asdict(SocketResponse.for_failure(str(e))) @@ -183,26 +184,15 @@ def activate_asyncio_warnings() -> None: def images_for_ids(image_ids, image_folder) -> List[str]: - logging.info(f'### Going to get images for {len(image_ids)} images ids') + logging.info('### Going to get images for %s images ids', len(image_ids)) start = perf_counter() images = [img for img in glob(f'{image_folder}/**/*.*', recursive=True) if os.path.splitext(os.path.basename(img))[0] in image_ids] end = perf_counter() - logging.info(f'found {len(images)} images for {len(image_ids)} image ids, which took {end-start:0.2f} seconds') + logging.info('found %s images for %s image ids, which took %.2f seconds', len(images), len(image_ids), end-start) return images -def generate_training(project_folder: str, context: Context) -> Training: - training_uuid = str(uuid4()) - return Training( - id=training_uuid, - context=context, - project_folder=project_folder, - images_folder=create_image_folder(project_folder), - training_folder=create_training_folder(project_folder, training_uuid) - ) - - def delete_all_training_folders(project_folder: str): if not os.path.exists(f'{project_folder}/trainings'): return diff --git a/learning_loop_node/loop_communication.py b/learning_loop_node/loop_communication.py index 7f44cb26..63eb4395 100644 --- a/learning_loop_node/loop_communication.py +++ b/learning_loop_node/loop_communication.py @@ -94,8 +94,7 @@ async def get(self, path: str, requires_login: bool = True, api_prefix: str = '/ if requires_login: await self.ensure_login() return await self.retry_on_401(self._get, path, api_prefix) - else: - return await self._get(path, api_prefix) + return await self._get(path, api_prefix) async def _get(self, path: str, api_prefix: str) -> httpx.Response: return await self.async_client.get(api_prefix+path) @@ -104,8 +103,7 @@ async def put(self, path: str, files: Optional[List[str]] = None, requires_login if requires_login: await self.ensure_login() return await self.retry_on_401(self._put, path, files, api_prefix, **kwargs) - else: - return await self._put(path, files, api_prefix, **kwargs) + return await self._put(path, files, api_prefix, **kwargs) async def _put(self, path: str, files: Optional[List[str]], api_prefix: str, **kwargs) -> httpx.Response: if files is None: @@ -133,8 +131,7 @@ async def post(self, path: str, requires_login: bool = True, api_prefix: str = ' if requires_login: await self.ensure_login() return await self.retry_on_401(self._post, path, api_prefix, **kwargs) - else: - return await self._post(path, api_prefix, **kwargs) + return await self._post(path, api_prefix, **kwargs) async def _post(self, path, api_prefix='/api', **kwargs) -> httpx.Response: return await self.async_client.post(api_prefix+path, **kwargs) @@ -143,8 +140,7 @@ async def delete(self, path: str, requires_login: bool = True, api_prefix: str = if requires_login: await self.ensure_login() return await self.retry_on_401(self._delete, path, api_prefix, **kwargs) - else: - return await self._delete(path, api_prefix, **kwargs) + return await self._delete(path, api_prefix, **kwargs) async def _delete(self, path, api_prefix, **kwargs) -> httpx.Response: return await self.async_client.delete(api_prefix+path, **kwargs) diff --git a/learning_loop_node/node.py b/learning_loop_node/node.py index b4c7c4af..318eb690 100644 --- a/learning_loop_node/node.py +++ b/learning_loop_node/node.py @@ -1,3 +1,10 @@ + +# NOTE: log_conf is imported first to initialize the loggers before they are created +from .helpers import log_conf # pylint: disable=unused-import + +# isort: split +# pylint: disable=wrong-import-order,ungrouped-imports + import asyncio import logging import ssl @@ -14,7 +21,6 @@ from .data_classes import NodeStatus from .data_exchanger import DataExchanger -from .helpers import log_conf from .helpers.misc import ensure_socket_response, read_or_create_uuid from .loop_communication import LoopCommunicator from .rest import router @@ -39,7 +45,6 @@ def __init__(self, name: str, uuid: Optional[str] = None, node_type: str = 'node """ super().__init__(lifespan=self.lifespan) - log_conf.init() self.name = name self.uuid = uuid or read_or_create_uuid(self.name) @@ -98,13 +103,14 @@ async def lifespan(self, app: FastAPI): # pylint: disable=unused-argument pass async def _on_startup(self): - self.log.info('received "startup" lifecycle-event') + self.log.info('received "startup" lifecycle-event - connecting to loop') try: await self.reconnect_to_loop() except Exception: self.log.warning('Could not establish sio connection to loop during startup') - self.log.info('done') + self.log.info('successfully connected to loop - calling on_startup') await self.on_startup() + self.log.info('successfully finished on_startup') async def _on_shutdown(self): self.log.info('received "shutdown" lifecycle-event') diff --git a/learning_loop_node/rest.py b/learning_loop_node/rest.py index 91297e95..3dc9fb7e 100644 --- a/learning_loop_node/rest.py +++ b/learning_loop_node/rest.py @@ -1,4 +1,5 @@ import logging +from logging import Logger from typing import TYPE_CHECKING from fastapi import APIRouter, HTTPException, Request @@ -20,14 +21,15 @@ async def _debug_logging(request: Request) -> str: ''' state = str(await request.body(), 'utf-8') node: 'Node' = request.app + log: Logger = node.log # type: ignore if state == 'off': logger.info('turning debug logging off') - node.log.setLevel('INFO') + log.setLevel('INFO') return 'off' if state == 'on': logger.info('turning debug logging on') - node.log.setLevel('DEBUG') + log.setLevel('DEBUG') return 'on' raise HTTPException(status_code=400, detail='Invalid state') diff --git a/learning_loop_node/tests/detector/conftest.py b/learning_loop_node/tests/detector/conftest.py index 86b4d0a4..9a298364 100644 --- a/learning_loop_node/tests/detector/conftest.py +++ b/learning_loop_node/tests/detector/conftest.py @@ -118,34 +118,30 @@ def get_outbox_files(outbox: Outbox): return [file for file in files if os.path.isfile(file)] -@pytest.fixture -def mock_detector_logic(): - class MockDetectorLogic(DetectorLogic): # pylint: disable=abstract-method - def __init__(self): - super().__init__('mock') - self.image_metadata = ImageMetadata( - box_detections=[BoxDetection(category_name="test", - category_id="1", - confidence=0.9, - x=0, y=0, width=10, height=10, - model_name="mock", - )]) - - @property - def is_initialized(self): - return True +class MockDetectorLogic(DetectorLogic): # pylint: disable=abstract-method + def __init__(self): + super().__init__('mock') + self.image_metadata = ImageMetadata( + box_detections=[BoxDetection(category_name="test", + category_id="1", + confidence=0.9, + x=0, y=0, width=10, height=10, + model_name="mock", + )]) - def evaluate_with_all_info(self, image: np.ndarray, tags: List[str], source: Optional[str] = None, creation_date: Optional[str] = None): - return self.image_metadata + @property + def is_initialized(self): + return True - return MockDetectorLogic() + def evaluate_with_all_info(self, image: np.ndarray, tags: List[str], source: Optional[str] = None, creation_date: Optional[str] = None): + return self.image_metadata @pytest.fixture -def detector_node(mock_detector_logic): +def detector_node(): os.environ['LOOP_ORGANIZATION'] = 'test_organization' os.environ['LOOP_PROJECT'] = 'test_project' - return DetectorNode(name="test_node", detector=mock_detector_logic) + return DetectorNode(name="test_node", detector=MockDetectorLogic()) # ====================================== REDUNDANT FIXTURES IN ALL CONFTESTS ! ====================================== diff --git a/learning_loop_node/tests/trainer/conftest.py b/learning_loop_node/tests/trainer/conftest.py index 2ec606f0..9a9898b3 100644 --- a/learning_loop_node/tests/trainer/conftest.py +++ b/learning_loop_node/tests/trainer/conftest.py @@ -30,12 +30,15 @@ async def test_initialized_trainer_node(): node = TrainerNode(name='test', trainer_logic=trainer, uuid='NOD30000-0000-0000-0000-000000000000') trainer._node = node trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'), - details={'categories': [], - 'id': '00000000-0000-0000-0000-000000000012', # version 1.2 of demo project - 'training_number': 0, - 'resolution': 800, - 'flip_rl': False, - 'flip_ud': False}) + training_config={'categories': [], + 'id': '00000000-0000-0000-0000-000000000012', # version 1.2 of demo project + 'training_number': 0, + 'model_variant': '', + 'hyperparameters': { + 'resolution': 800, + 'flip_rl': False, + 'flip_ud': False} + }) await node._on_startup() yield node await node._on_shutdown() @@ -50,12 +53,15 @@ async def test_initialized_trainer(): await node._on_startup() trainer._node = node trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'), - details={'categories': [], - 'id': '00000000-0000-0000-0000-000000000012', # version 1.2 of demo project - 'training_number': 0, - 'resolution': 800, - 'flip_rl': False, - 'flip_ud': False}) + training_config={'categories': [], + 'id': '00000000-0000-0000-0000-000000000012', # version 1.2 of demo project + 'training_number': 0, + 'model_variant': '', + 'hyperparameters': { + 'resolution': 800, + 'flip_rl': False, + 'flip_ud': False} + }) yield trainer try: await node._on_shutdown() diff --git a/learning_loop_node/tests/trainer/states/test_state_download_train_model.py b/learning_loop_node/tests/trainer/states/test_state_download_train_model.py index 82e2c11f..2de6c52f 100644 --- a/learning_loop_node/tests/trainer/states/test_state_download_train_model.py +++ b/learning_loop_node/tests/trainer/states/test_state_download_train_model.py @@ -3,6 +3,7 @@ import os from ....data_classes import TrainerState +from ... import test_helper from ..state_helper import assert_training_state, create_active_training_file from ..testing_trainer_logic import TestingTrainerLogic @@ -11,9 +12,12 @@ async def test_downloading_is_successful(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer - create_active_training_file(trainer, training_state=TrainerState.DataDownloaded) - trainer.model_format = 'mocked' + model_id = await test_helper.get_latest_model_id(project='demo') + create_active_training_file(trainer, + base_model_uuid=model_id, + training_state=TrainerState.DataDownloaded) + trainer._init_from_last_training() asyncio.get_running_loop().create_task( @@ -50,7 +54,7 @@ async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogi async def test_downloading_failed(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer create_active_training_file(trainer, training_state=TrainerState.DataDownloaded, - base_model_uuid_or_name='00000000-0000-0000-0000-000000000000') # bad model id) + base_model_uuid='00000000-0000-0000-0000-000000000000') # bad model id) trainer._init_from_last_training() trainer._begin_training_task() diff --git a/learning_loop_node/tests/trainer/states/test_state_prepare.py b/learning_loop_node/tests/trainer/states/test_state_prepare.py index 59fb9c91..7c926399 100644 --- a/learning_loop_node/tests/trainer/states/test_state_prepare.py +++ b/learning_loop_node/tests/trainer/states/test_state_prepare.py @@ -20,7 +20,6 @@ async def test_preparing_is_successful(test_initialized_trainer: TestingTrainerL await trainer._perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, trainer._prepare) assert trainer_has_prepare_error(trainer) is False assert trainer.training.training_state == TrainerState.DataDownloaded - assert trainer.training.data is not None assert trainer.node.last_training_io.load() == trainer.training diff --git a/learning_loop_node/tests/trainer/states/test_state_sync_confusion_matrix.py b/learning_loop_node/tests/trainer/states/test_state_sync_confusion_matrix.py index 5a6ad4b4..55b8e9c4 100644 --- a/learning_loop_node/tests/trainer/states/test_state_sync_confusion_matrix.py +++ b/learning_loop_node/tests/trainer/states/test_state_sync_confusion_matrix.py @@ -19,7 +19,7 @@ def trainer_has_sync_confusion_matrix_error(trainer: TrainerLogic): async def test_nothing_to_sync(test_initialized_trainer: TestingTrainerLogic): trainer = test_initialized_trainer - # TODO this requires trainer to have _training + # NOTE: this requires trainer to have _training # trainer.load_active_training() create_active_training_file(trainer, training_state=TrainerState.TrainingFinished) trainer._init_from_last_training() @@ -40,6 +40,7 @@ async def test_unsynced_model_available__sync_successful(test_initialized_traine create_active_training_file(trainer, training_state=TrainerState.TrainingFinished) trainer._init_from_last_training() + trainer.training.image_data = [] trainer.has_new_model = True trainer._begin_training_task() diff --git a/learning_loop_node/tests/trainer/states/test_state_train.py b/learning_loop_node/tests/trainer/states/test_state_train.py index ec126e03..536923e4 100644 --- a/learning_loop_node/tests/trainer/states/test_state_train.py +++ b/learning_loop_node/tests/trainer/states/test_state_train.py @@ -1,5 +1,3 @@ -import asyncio - from ....data_classes import TrainerState from ...test_helper import condition from ..state_helper import assert_training_state, create_active_training_file diff --git a/learning_loop_node/tests/trainer/test_trainer_states.py b/learning_loop_node/tests/trainer/test_trainer_states.py index 2215e57a..9910f085 100644 --- a/learning_loop_node/tests/trainer/test_trainer_states.py +++ b/learning_loop_node/tests/trainer/test_trainer_states.py @@ -14,7 +14,12 @@ def create_training() -> Training: context=context, project_folder='', images_folder='', - training_folder='') + training_folder='', + categories=[], + hyperparameters={}, + model_variant='', + training_number=0, + training_state=TrainerState.Preparing) return training diff --git a/learning_loop_node/tests/trainer/testing_trainer_logic.py b/learning_loop_node/tests/trainer/testing_trainer_logic.py index a8d435ea..b3f56a9d 100644 --- a/learning_loop_node/tests/trainer/testing_trainer_logic.py +++ b/learning_loop_node/tests/trainer/testing_trainer_logic.py @@ -30,13 +30,13 @@ def provided_pretrained_models(self) -> List[PretrainedModel]: PretrainedModel(name='large', label='Large', description='a large model')] # pylint: disable=unused-argument - async def _start_training_from_base_model(self, model: str = 'model.model') -> None: + async def _start_training_from_base_model(self) -> None: assert self._executor is not None await self._executor.start('/bin/bash -c "while true; do sleep 1; done"') async def _start_training_from_scratch(self) -> None: - assert self.training.base_model_uuid_or_name is not None, 'base_model_uuid_or_name must be set' - await self._start_training_from_base_model(model=f'model_{self.training.base_model_uuid_or_name}.pt') + assert self._executor is not None + await self._executor.start('/bin/bash -c "while true; do sleep 1; done"') def _get_new_best_training_state(self) -> Optional[TrainingStateData]: if self.has_new_model: diff --git a/learning_loop_node/trainer/downloader.py b/learning_loop_node/trainer/downloader.py index 7c8ab10e..fdb884c9 100644 --- a/learning_loop_node/trainer/downloader.py +++ b/learning_loop_node/trainer/downloader.py @@ -27,5 +27,5 @@ async def download_images_and_annotations(self, image_ids: List[str], image_fold valid_image_data.append(i) else: skipped_image_count += 1 - logging.info(f'Done downloading image data for {len(image_data)} images.') + logging.info('Done downloading image data for %s images.', len(image_data)) return (valid_image_data, skipped_image_count) diff --git a/learning_loop_node/trainer/executor.py b/learning_loop_node/trainer/executor.py index 8286b50d..d128d9b5 100644 --- a/learning_loop_node/trainer/executor.py +++ b/learning_loop_node/trainer/executor.py @@ -3,7 +3,7 @@ import os import shlex from io import BufferedWriter -from typing import List, Optional, Dict +from typing import Dict, List, Optional class Executor: @@ -33,7 +33,7 @@ async def start(self, cmd: str, env: Optional[Dict[str, str]] = None) -> None: if env is not None: full_env.update(env) - logging.info(f'Starting executor with command: {cmd} in {self.path} - logging to {self.log_file_path}') + logging.info('Starting executor with command: %s in %s - logging to %s', cmd, self.path, self.log_file_path) self.log_file = open(self.log_file_path, 'ab') self._process = await asyncio.create_subprocess_exec( diff --git a/learning_loop_node/trainer/rest/backdoor_controls.py b/learning_loop_node/trainer/rest/backdoor_controls.py index 54ed4125..6e4efbc2 100644 --- a/learning_loop_node/trainer/rest/backdoor_controls.py +++ b/learning_loop_node/trainer/rest/backdoor_controls.py @@ -29,7 +29,7 @@ async def provide_new_model(request: Request): if value == 'on': trainer_node.trainer_logic.provide_new_model = True # type: ignore - logging.debug(f'turning automatically provide_new_model {value}') + logging.debug('turning automatically provide_new_model %s', value) @router.post("/reset") @@ -64,7 +64,7 @@ def set_error_configuration(msg: Dict, request: Request): get_new_model=msg.get('get_new_model', None), save_model=msg.get('save_model', None), ) - logging.info(f'setting error configuration to: {asdict(error_configuration)}') + logging.info('setting error configuration to: %s', asdict(error_configuration)) trainer_logic = request.app.trainer_logic # NOTE: trainer_logic is MockTrainerLogic which has a property error_configuration @@ -82,23 +82,23 @@ async def add_steps(request: Request): if not trainer_logic._executor or not trainer_logic._executor.is_running(): # pylint: disable=protected-access training = trainer_logic._training # pylint: disable=protected-access - logging.error(f'cannot add steps when not running, state: {training.training_state if training else "None"}') + logging.error('cannot add steps when not running, state: %s', training.training_state if training else 'None') raise HTTPException(status_code=409, detail="trainer is not running") steps = int(str(await request.body(), 'utf-8')) previous_state = trainer_logic.provide_new_model # type: ignore trainer_logic.provide_new_model = True # type: ignore - logging.warning(f'simulating newly completed models by moving {steps} forward') + logging.warning('simulating newly completed models by moving %s forward', steps) for _ in range(steps): try: logging.warning('calling sync_confusion_matrix') - await trainer_logic._sync_confusion_matrix() # pylint: disable=protected-access + await trainer_logic._sync_training() # pylint: disable=protected-access except Exception: pass # Tests can force synchroniation to fail, error state is reported to backend trainer_logic.provide_new_model = previous_state # type: ignore - logging.warning(f'progress increased to {trainer_logic.current_iteration}') # type: ignore + logging.warning('progress increased to %s', trainer_logic.current_iteration) # type: ignore await trainer_node.send_status() diff --git a/learning_loop_node/trainer/trainer_logic.py b/learning_loop_node/trainer/trainer_logic.py index 13d8dd17..9392c0c9 100644 --- a/learning_loop_node/trainer/trainer_logic.py +++ b/learning_loop_node/trainer/trainer_logic.py @@ -62,7 +62,7 @@ async def _train(self) -> None: break self.errors.reset(error_key) try: - await self._sync_confusion_matrix() + await self._sync_training() except asyncio.CancelledError: logging.warning('CancelledError in run_training') raise @@ -130,8 +130,12 @@ async def _start_training(self): if self._can_resume(): self.start_training_task = self._resume() else: - base_model_uuid_or_name = self.training.base_model_uuid_or_name - if not is_valid_uuid4(base_model_uuid_or_name): + base_model_uuid_is_none = self.training.base_model_uuid is None + base_model_uuid_is_valid = is_valid_uuid4(self.training.base_model_uuid) + if not base_model_uuid_is_none and not base_model_uuid_is_valid: + logging.warning('base_model_uuid is not a valid uuid4: %s\n Starting training from scratch.', + self.training.base_model_uuid) + if not base_model_uuid_is_valid: self.start_training_task = self._start_training_from_scratch() else: self.start_training_task = self._start_training_from_base_model() diff --git a/learning_loop_node/trainer/trainer_logic_generic.py b/learning_loop_node/trainer/trainer_logic_generic.py index 8f99b937..9cd8d8ff 100644 --- a/learning_loop_node/trainer/trainer_logic_generic.py +++ b/learning_loop_node/trainer/trainer_logic_generic.py @@ -10,9 +10,9 @@ from fastapi.encoders import jsonable_encoder -from ..data_classes import (Context, Errors, Hyperparameter, PretrainedModel, TrainerState, Training, TrainingData, - TrainingOut, TrainingStateData) -from ..helpers.misc import create_project_folder, delete_all_training_folders, generate_training, is_valid_uuid4 +from ..data_classes import (Context, Errors, PretrainedModel, TrainerState, Training, TrainingOut, TrainingStateData, + TrainingStatus) +from ..helpers.misc import create_project_folder, delete_all_training_folders, is_valid_uuid4 from .downloader import TrainingsDownloader from .exceptions import CriticalError, NodeNeedsRestartError from .io_helpers import ActiveTrainingIO, EnvironmentVars, LastTrainingIO @@ -66,19 +66,12 @@ def training(self) -> Training: return self._training @property - def hyperparameter(self) -> Hyperparameter: - assert self.training_data is not None, 'Training should have data' - assert self.training_data.hyperparameter is not None, 'Training.data should have hyperparameter' - return self.training_data.hyperparameter + def hyperparameters(self) -> dict: + assert self._training is not None, 'Training should have data' + return self._training.hyperparameters # ---------------------------------------- PROPERTIES ---------------------------------------- - @property - def training_data(self) -> Optional[TrainingData]: - if self.training_active and self.training.data: - return self.training.data - return None - @property def training_context(self) -> Optional[Context]: if self.training_active: @@ -111,12 +104,8 @@ def training_uptime(self) -> Optional[float]: def hyperparameters_for_state_sync(self) -> Optional[Dict]: """Used in sync_confusion_matrix and send_status to provide information about the training configuration. """ - if self._training and self._training.data and self._training.data.hyperparameter: - information = {} - information['resolution'] = self._training.data.hyperparameter.resolution - information['flipRl'] = self._training.data.hyperparameter.flip_rl - information['flipUd'] = self._training.data.hyperparameter.flip_ud - return information + if self._training: + return self._training.hyperparameters return None @property @@ -173,6 +162,24 @@ def provided_pretrained_models(self) -> List[PretrainedModel]: # Initializing a new training object will create the folder structure for the training. # The training loop will then run through the states of the training. + def generate_status_for_loop(self, trainer_uuid: str, trainer_name: str) -> TrainingStatus: + + status = TrainingStatus(id=trainer_uuid, + name=trainer_name, + state=self.state, + errors={}, + uptime=self.training_uptime, + progress=self.general_progress) + + status.pretrained_models = self.provided_pretrained_models + status.architecture = self.model_architecture + + if self._training: + status.errors = self.errors.errors + status.context = self.training_context + + return status + async def try_continue_run_if_incomplete(self) -> bool: """Tries to continue a training if the last training was not finished. """ @@ -188,29 +195,30 @@ def _init_from_last_training(self) -> None: """ self._training = self.last_training_io.load() assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder' + logger.info('restored training: \n%s', self._training) self._active_training_io = ActiveTrainingIO( self._training.training_folder, self.node.loop_communicator, self._training.context) - async def begin_training(self, organization: str, project: str, details: Dict) -> None: + async def begin_training(self, organization: str, project: str, training_config: Dict) -> None: """Called on `begin_training` event from the Learning Loop. """ - self._init_new_training(Context(organization=organization, project=project), details) + self._init_new_training(Context(organization=organization, project=project), training_config) self._begin_training_task() def _begin_training_task(self) -> None: # NOTE: Task object is used to potentially cancel the task self.training_task = asyncio.get_event_loop().create_task(self._run()) - def _init_new_training(self, context: Context, details: Dict) -> None: + def _init_new_training(self, context: Context, training_config: Dict) -> None: """Called on `begin_training` event from the Learning Loop. - Note that details needs the entries 'categories' and 'training_number', + Note that training_config needs the entries 'categories', 'model_variant' and 'training_number', but also the hyperparameter entries. + 'base_model_uuid' is optional if the training is continued from a previous training. """ project_folder = create_project_folder(context) if not self._environment_vars.keep_old_trainings: delete_all_training_folders(project_folder) - self._training = generate_training(project_folder, context) - self._training.set_values_from_data(details) + self._training = Training.generate_training(project_folder, context, training_config) self._active_training_io = ActiveTrainingIO( self._training.training_folder, self.node.loop_communicator, context) @@ -254,7 +262,7 @@ async def _training_loop(self) -> None: elif tstate == TrainerState.TrainModelDownloaded: # -> TrainingRunning -> TrainingFinished await self._perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train) elif tstate == TrainerState.TrainingFinished: # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced - await self._perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self._sync_confusion_matrix) + await self._perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self._sync_training) elif tstate == TrainerState.ConfusionMatrixSynced: # -> TrainModelUploading -> TrainModelUploaded await self._perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, self._upload_model) elif tstate == TrainerState.TrainModelUploaded: # -> Detecting -> Detected @@ -298,6 +306,7 @@ async def _perform_state(self, error_key: str, state_during: TrainerState, state logger.error('Node Restart Requested') sys.exit(0) except Exception as e: + print('Error in %s - Exception: %s', state_during, e, flush=True) self.errors.set(error_key, str(e)) logger.exception('Error in %s - Exception: %s', state_during, e) self.training.training_state = previous_state @@ -316,19 +325,25 @@ async def _prepare(self) -> None: self.node.data_exchanger.set_context(self.training.context) downloader = TrainingsDownloader(self.node.data_exchanger) image_data, skipped_image_count = await downloader.download_training_data(self.training.images_folder) - assert self.training.data is not None, 'training.data must be set' - self.training.data.image_data = image_data - self.training.data.skipped_image_count = skipped_image_count + + self.training.image_data = image_data + self.training.skipped_image_count = skipped_image_count async def _download_model(self) -> None: """If training is continued, the model is downloaded from the Learning Loop to the training_folder. The downloaded model.json file is renamed to base_model.json because a new model.json will be created during training. """ - base_model_uuid = self.training.base_model_uuid_or_name + base_model_uuid = self.training.base_model_uuid + base_model_uuid_is_none = base_model_uuid is None + base_model_uuid_is_valid = is_valid_uuid4(base_model_uuid) + + if not base_model_uuid_is_none and not base_model_uuid_is_valid: + logger.warning( + 'base model uuid was provided but was not valid (base_model_uuid: %s).\nSkipping download and starting training from scratch.', base_model_uuid) + return - # TODO this checks if we continue a training -> make more explicit - if not base_model_uuid or not is_valid_uuid4(base_model_uuid): - logger.info('skipping model download. No base model provided (in form of uuid): %s', base_model_uuid) + if base_model_uuid_is_none: + logger.info('No base model provided (base_model_uuid: %s).\nStarting training from scratch.', base_model_uuid) return logger.info('loading model from Learning Loop') @@ -337,19 +352,21 @@ async def _download_model(self) -> None: shutil.move(f'{self.training.training_folder}/model.json', f'{self.training.training_folder}/base_model.json') - async def _sync_confusion_matrix(self) -> None: - """Syncronizes the confusion matrix with the Learning Loop via the update_training endpoint. + async def _sync_training(self) -> None: + """Syncronizes the training with the Learning Loop via the update_training endpoint. NOTE: This stage sets the errors explicitly because it may be used inside the training stage. """ error_key = 'sync_confusion_matrix' try: new_best_model = self._get_new_best_training_state() - if new_best_model and self.training.data: + if new_best_model: new_training = TrainingOut(trainer_id=self.node.uuid, + trainer_name=self.node.name, confusion_matrix=new_best_model.confusion_matrix, - train_image_count=self.training.data.train_image_count(), - test_image_count=self.training.data.test_image_count(), - hyperparameters=self.hyperparameters_for_state_sync) + train_image_count=self.training.train_image_count(), + test_image_count=self.training.test_image_count(), + hyperparameters=self.hyperparameters_for_state_sync, + best_epoch=new_best_model.epoch) await asyncio.sleep(0.1) # NOTE needed for tests. result = await self.node.sio_client.call('update_training', ( @@ -411,7 +428,7 @@ async def _upload_model_return_new_model_uuid(self, context: Context) -> str: def _dump_categories_to_json(self) -> str: """Dumps the categories to a json file and returns the path to the file. """ - content = {'categories': [asdict(c) for c in self.training_data.categories], } if self.training_data else None + content = {'categories': [asdict(c) for c in self._training.categories], } if self._training else None json_path = '/tmp/model.json' with open(json_path, 'w') as f: json.dump(content, f) @@ -481,12 +498,13 @@ async def _do_detections(self) -> None: @abstractmethod def _get_new_best_training_state(self) -> Optional[TrainingStateData]: - """Is called frequently by `_sync_confusion_matrix` to check if a new "best" model is availabe. + """Is called frequently by `_sync_training` during training to check if a new "best" model is availabe. Returns None if no new model could be found. Otherwise TrainingStateData(confusion_matrix, meta_information). `confusion_matrix` contains a dict of all classes: - The classes must be identified by their uuid, not their name. - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives). `meta_information` can hold any data which is helpful for self._on_metrics_published to store weight file etc for later upload via self.get_model_files + `epoch` is the epoch number of the best model. """ raise NotImplementedError diff --git a/learning_loop_node/trainer/trainer_node.py b/learning_loop_node/trainer/trainer_node.py index 884a82ee..fd059da8 100644 --- a/learning_loop_node/trainer/trainer_node.py +++ b/learning_loop_node/trainer/trainer_node.py @@ -7,7 +7,6 @@ from fastapi.encoders import jsonable_encoder from socketio import AsyncClient, exceptions -from ..data_classes import TrainingStatus from ..node import Node from .io_helpers import LastTrainingIO from .rest import backdoor_controls @@ -23,14 +22,15 @@ def __init__(self, name: str, trainer_logic: TrainerLogicGeneric, uuid: Optional self.last_training_io = LastTrainingIO(self.uuid) self.trainer_logic._last_training_io = self.last_training_io - self.first_idle_time: float | None = None + self._first_idle_time: float | None = None if os.environ.get('TRAINER_IDLE_TIMEOUT_SEC', 0.0): - self.idle_timeout = float(os.environ.get('TRAINER_IDLE_TIMEOUT_SEC', 0.0)) + self._idle_timeout = float(os.environ.get('TRAINER_IDLE_TIMEOUT_SEC', 0.0)) else: - self.idle_timeout = 0.0 - if self.idle_timeout: + self._idle_timeout = 0.0 + if self._idle_timeout: self.log.info( - f'Trainer started with an idle_timeout of {self.idle_timeout} seconds. Note that shutdown does not work if docker container has the restart policy set to always') + 'Trainer started with an idle_timeout of %s seconds. Note that shutdown does not work if docker container has the restart policy set to always', + self._idle_timeout) if use_backdoor_controls or os.environ.get('USE_BACKDOOR_CONTROLS', '0').lower() in ('1', 'true'): self.include_router(backdoor_controls.router, tags=["controls"]) @@ -53,8 +53,8 @@ async def on_repeat(self): except exceptions.TimeoutError: self.log.warning('timeout when sending status to learning loop, reconnecting sio_client') await self.sio_client.disconnect() # NOTE: reconnect happens in node._on_repeat - except Exception as e: - self.log.exception(f'could not send status state: {e}') + except Exception: + self.log.exception('could not send status. Exception:') # ---------------------------------------------- NODE METHODS --------------------------------------------------- @@ -68,7 +68,7 @@ async def begin_training(organization: str, project: str, details: Dict): @sio_client.event async def stop_training(): - self.log.info(f'stop_training received. Current state : {self.status.state}') + self.log.info('stop_training received. Current state : %s', self.trainer_logic.state) try: await self.trainer_logic.stop() except Exception: @@ -80,24 +80,7 @@ async def send_status(self): self.log.debug('cannot send status - not connected to the Learning Loop') return - status = TrainingStatus(id=self.uuid, - name=self.name, - state=self.trainer_logic.state, - errors={}, - uptime=self.trainer_logic.training_uptime, - progress=self.trainer_logic.general_progress) - - status.pretrained_models = self.trainer_logic.provided_pretrained_models - status.architecture = self.trainer_logic.model_architecture - - if data := self.trainer_logic.training_data: - status.train_image_count = data.train_image_count() - status.test_image_count = data.test_image_count() - status.skipped_image_count = data.skipped_image_count - status.hyperparameters = self.trainer_logic.hyperparameters_for_state_sync - status.errors = self.trainer_logic.errors.errors - status.context = self.trainer_logic.training_context - + status = self.trainer_logic.generate_status_for_loop(self.uuid, self.name) self.log.debug('sending status: %s', status.short_str()) result = await self.sio_client.call('update_trainer', jsonable_encoder(asdict(status)), timeout=30) if isinstance(result, Dict) and not result['success']: @@ -105,17 +88,17 @@ async def send_status(self): self.log.error('Error when sending status update: Response from loop was:\n %s', result) def check_idle_timeout(self): - if not self.idle_timeout: + if not self._idle_timeout: return if self.trainer_logic.state == 'idle': - if self.first_idle_time is None: - self.first_idle_time = time.time() - idle_time = time.time() - self.first_idle_time - if idle_time > self.idle_timeout: + if self._first_idle_time is None: + self._first_idle_time = time.time() + idle_time = time.time() - self._first_idle_time + if idle_time > self._idle_timeout: self.log.info('Trainer has been idle for %.2f s (with timeout %.2f s). Shutting down.', - idle_time, self.idle_timeout) + idle_time, self._idle_timeout) sys.exit(0) - self.log.debug('idle time: %.2f s / %.2f s', idle_time, self.idle_timeout) + self.log.debug('idle time: %.2f s / %.2f s', idle_time, self._idle_timeout) else: - self.first_idle_time = None + self._first_idle_time = None diff --git a/mock_detector/app_code/tests/test_detector.py b/mock_detector/app_code/tests/test_detector.py index 19719fc9..e5b7680e 100644 --- a/mock_detector/app_code/tests/test_detector.py +++ b/mock_detector/app_code/tests/test_detector.py @@ -3,7 +3,6 @@ import pytest -from learning_loop_node.detector.detector_node import DetectorNode from learning_loop_node.globals import GLOBALS # pylint: disable=unused-argument @@ -14,7 +13,7 @@ @pytest.fixture(scope="session") -def event_loop(request): +def event_loop(): """https://stackoverflow.com/a/66225169/4082686 Create an instance of the default event loop for each test case. Prevents 'RuntimeError: Event loop is closed' @@ -29,7 +28,8 @@ def test_assert_data_folder_for_tests(): assert GLOBALS.data_folder.startswith('/tmp') -async def test_sio_detect(test_detector_node: DetectorNode, sio): +@pytest.mark.usefixtures('test_detector_node') +async def test_sio_detect(sio): with open(test_image_path, 'rb') as f: image_bytes = f.read() diff --git a/mock_trainer/app_code/progress_simulator.py b/mock_trainer/app_code/progress_simulator.py index 76f8be52..97a3c345 100644 --- a/mock_trainer/app_code/progress_simulator.py +++ b/mock_trainer/app_code/progress_simulator.py @@ -6,12 +6,11 @@ def increment_time(trainer: TrainerLogic, latest_known_confusion_matrix: Dict) -> Optional[TrainingStateData]: - if not trainer._training or not trainer._training.data: # pylint: disable=protected-access + if not trainer._training: # pylint: disable=protected-access return None confusion_matrix = {} - assert trainer.training.data is not None - for category in trainer.training.data.categories: + for category in trainer.training.categories: try: minimum = latest_known_confusion_matrix[category.id]['tp'] except Exception: diff --git a/mock_trainer/app_code/tests/test_detections.py b/mock_trainer/app_code/tests/test_detections.py index 1d0dd3a9..b56873c6 100644 --- a/mock_trainer/app_code/tests/test_detections.py +++ b/mock_trainer/app_code/tests/test_detections.py @@ -1,21 +1,20 @@ from dataclasses import asdict from glob import glob +# pylint: disable=protected-access,redefined-outer-name,unused-argument +import pytest from fastapi.encoders import jsonable_encoder from learning_loop_node.data_classes import Category, Context from learning_loop_node.globals import GLOBALS -from learning_loop_node.helpers.misc import create_project_folder, generate_training -from learning_loop_node.loop_communication import LoopCommunicator from learning_loop_node.tests import test_helper from learning_loop_node.trainer.trainer_node import TrainerNode from ..mock_trainer_logic import MockTrainerLogic -# pylint: disable=protected-access,redefined-outer-name,unused-argument - -async def test_all(setup_test_project1, glc: LoopCommunicator): +@pytest.mark.usefixtures('setup_test_project1') +async def test_all(): assert_image_count(0) assert GLOBALS.data_folder == '/tmp/learning_loop_lib_data' @@ -27,18 +26,18 @@ async def test_all(setup_test_project1, glc: LoopCommunicator): details = {'categories': [jsonable_encoder(asdict(Category(id='some_id', name='some_category_name')))], 'id': '798bfbf1-8948-2ea9-32fb-3571b6748bca', # version 1.2 of demo project 'training_number': 0, - 'resolution': 800, - 'flip_rl': False, - 'flip_ud': False} + 'model_variant': '', + 'hyperparameters': { + 'resolution': 800, + 'flip_rl': False, + 'flip_ud': False} + } # await asyncio.sleep(100) trainer._node = node - trainer._init_new_training(context=context, details=details) + trainer._init_new_training(context=context, training_config=details) + trainer.training.model_uuid_for_detecting = latest_model_id - project_folder = create_project_folder(context) - training = generate_training(project_folder, context) - training.model_uuid_for_detecting = latest_model_id - trainer._training = training await trainer._do_detections() detections = trainer.active_training_io.load_detections() diff --git a/mock_trainer/app_code/tests/test_mock_trainer.py b/mock_trainer/app_code/tests/test_mock_trainer.py index e2b518b0..79b2e087 100644 --- a/mock_trainer/app_code/tests/test_mock_trainer.py +++ b/mock_trainer/app_code/tests/test_mock_trainer.py @@ -1,7 +1,9 @@ from typing import Dict from uuid import uuid4 -from learning_loop_node.data_classes import Context, Model, Training, TrainingData +import pytest + +from learning_loop_node.data_classes import Context, TrainerState, Training from learning_loop_node.globals import GLOBALS from learning_loop_node.trainer.executor import Executor @@ -17,7 +19,8 @@ async def create_mock_trainer() -> MockTrainerLogic: return mock_trainer -async def test_get_model_files(setup_test_project2): +@pytest.mark.usefixtures('setup_test_project2') +async def test_get_model_files(): mock_trainer = await create_mock_trainer() files = await mock_trainer._get_latest_model_files() @@ -28,18 +31,23 @@ async def test_get_model_files(setup_test_project2): assert files['mocked_2'] == ['/tmp/weightfile.weights', '/tmp/some_more_data.txt'] -async def test_get_new_model(setup_test_project2): +@pytest.mark.usefixtures('setup_test_project2') +async def test_get_new_model(): mock_trainer = await create_mock_trainer() await mock_trainer._start_training_from_base_model() - model = Model(uuid=(str(uuid4()))) context = Context(organization="", project="") mock_trainer._training = Training( # pylint: disable=protected-access id=str(uuid4()), context=context, project_folder="", images_folder="", - training_folder="",) - mock_trainer.training.data = TrainingData(image_data=[], categories=[]) + training_folder="", + categories=[], + hyperparameters={}, + model_variant='', + training_number=0, + training_state=TrainerState.Preparing) + mock_trainer._training.image_data = [] model = mock_trainer._get_new_best_training_state() assert model is not None