-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcompose-llms.yaml
44 lines (42 loc) · 965 Bytes
/
compose-llms.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
version: '3'
services:
  # Hugging Face Text Generation Inference server, exposed on host port 8080.
  tgi:
    image: ghcr.io/huggingface/text-generation-inference:latest
    container_name: tgi
    ports:
      # Quoted: unquoted digits-and-colons scalars (e.g. 8080:80) are parsed
      # as base-60 integers by YAML 1.1 parsers — always quote port mappings.
      - "8080:80"
    volumes:
      # Host directory for the model download cache; set LOCAL_MODEL_CACHE_DIR
      # in the environment / .env file.
      - ${LOCAL_MODEL_CACHE_DIR}:/model_cache
    environment:
      # Hub token for gated models (e.g. Llama); supplied via LLAMA_TOKEN.
      - HUGGING_FACE_HUB_TOKEN=${LLAMA_TOKEN}
    # need this to access GPU
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    # TGI launcher arguments; all values are env-interpolated at compose time.
    command:
      - '--huggingface-hub-cache'
      - '/model_cache'
      - '--model-id'
      - '${MODEL_ID}'
      - '--max-batch-prefill-tokens'
      - '${MAX_PREFILL_TOKENS}'
      - '--quantize'
      - '${QUANT}'
      - '--max-total-tokens'
      - '${MAX_TOTAL_TOKENS}'
      - '--max-input-length'
      - '${MAX_INPUT_LENGTH}'
    # Larger shared memory segment; TGI uses /dev/shm for inter-process work.
    shm_size: 1gb
  # Chat UI frontend, built locally from ./chat_ui/.
  ui:
    image: localllm-ui:latest
    container_name: ui
    build:
      context: ./chat_ui/
    ports:
      # Quoted for the same YAML sexagesimal reason as above.
      - "7000:7000"
  # api:
  # image: