Commit 0a364d8

Authored May 20, 2024
Image cache and balloon on H100s, also temporarily stop people from using A100 (#523)

* Cache H100
* Stop people from using A100
* no cover
* no cover
* update client version

1 parent 9261c49 commit 0a364d8

File tree: 8 files changed (+85 -3)
 
@@ -0,0 +1,50 @@
+{{- if not .Values.serviceIdentifier }}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ .Chart.Name }}-balloon-h100
+  labels:
+    team: infra
+    product: common-warm-nodes
+spec:
+  replicas: {{ .Values.replicaCount.balloonH100 }}
+  selector:
+    matchLabels:
+      app: {{ .Chart.Name }}-balloon-h100
+      version: v1
+  template:
+    metadata:
+      labels:
+        app: {{ .Chart.Name }}-balloon-h100
+        product: common-warm-nodes
+        team: infra
+        env: {{ .Values.context }}
+        version: v1
+      annotations:
+        sidecar.istio.io/inject: "false"
+    spec:
+      nodeSelector:
+        k8s.amazonaws.com/accelerator: nvidia-hopper-h100
+        {{- with .Values.balloonNodeSelector }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+      containers:
+        - image: public.ecr.aws/ubuntu/ubuntu:latest
+          imagePullPolicy: IfNotPresent
+          name: main
+          resources:
+            limits:
+              memory: 28Gi
+              nvidia.com/gpu: 1
+              cpu: 4
+          command:
+            - /bin/bash
+            - -c
+            - "while true; do sleep 30; done"
+      terminationGracePeriodSeconds: 0
+      priorityClassName: {{ .Chart.Name }}-low-priority
+{{- end }}
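
This template is a warm-capacity "balloon": the pod requests a full H100 plus CPU and memory but only runs a sleep loop, so the cluster autoscaler keeps the GPU node alive while the low-priority class (a PriorityClass presumably defined elsewhere in the chart as {{ .Chart.Name }}-low-priority) lets any real workload preempt it instantly. A minimal sketch of adjusting the warm pool at runtime with the official Kubernetes Python client; the deployment name and namespace below are assumptions for a release of this chart installed as "model-engine", not values taken from this commit:

from kubernetes import client, config

# Assumes kubeconfig access to the cluster running this chart.
config.load_kube_config()
apps = client.AppsV1Api()

# Hypothetical object name: "<chart name>-balloon-h100" as rendered by the
# template above, assuming the chart is named "model-engine" and installed
# into the "default" namespace.
apps.patch_namespaced_deployment_scale(
    name="model-engine-balloon-h100",
    namespace="default",
    body={"spec": {"replicas": 2}},  # hold two warm H100 nodes
)

In steady state the same knob is driven declaratively through replicaCount.balloonH100 in the values files below.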

charts/model-engine/values_circleci.yaml (+1)
@@ -8,6 +8,7 @@ replicaCount:
   balloonA100: 0
   balloonCpu: 0
   balloonT4: 0
+  balloonH100: 0
 
 # tag needs to be set dynamically every time. Usually it is set to the SHA1 hash of the git
 # commit from which the image was built.

charts/model-engine/values_sample.yaml (+23)
@@ -81,6 +81,8 @@ replicaCount:
   balloonCpu: 0
   # balloonT4 is a low priority pod deployment for T4 GPU nodes
   balloonT4: 0
+  # balloonH100 is a low priority pod deployment for H100 GPU nodes
+  balloonH100: 0
 
 # autoscaling is the autoscaling configuration for LLM Engine server deployments (e.g gateway, cache, and builder deployments)
 autoscaling:
@@ -254,6 +256,27 @@ imageCache:
         - key: "nvidia.com/gpu"
           operator: "Exists"
           effect: "NoSchedule"
+    - name: h100
+      nodeSelector:
+        k8s.amazonaws.com/accelerator: nvidia-hopper-h100
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+    - name: h100-mig-1g-20gb
+      nodeSelector:
+        k8s.amazonaws.com/accelerator: nvidia-hopper-h100-mig-1g-20gb
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+    - name: h100-mig-3g-40gb
+      nodeSelector:
+        k8s.amazonaws.com/accelerator: nvidia-hopper-h100-mig-3g-40gb
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
 
 # celeryBrokerType specifies the celery broker type for async endpoints, either "sqs" or "elasticache"
 celeryBrokerType: sqs

clients/python/llmengine/__init__.py (+1 -1)
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.0.0b32"
+__version__ = "0.0.0b33"
 
 import os
 from typing import Sequence

clients/python/llmengine/model.py (+4)
@@ -149,6 +149,8 @@ def create(
             - ``nvidia-ampere-a100``
             - ``nvidia-ampere-a100e``
             - ``nvidia-hopper-h100``
+            - ``nvidia-hopper-h100-1g20gb``
+            - ``nvidia-hopper-h100-3g40gb``
 
         high_priority (`Optional[bool]`):
             Either ``True`` or ``False``. Enabling this will allow the created
@@ -533,6 +535,8 @@ def update(
             - ``nvidia-ampere-a100``
             - ``nvidia-ampere-a100e``
             - ``nvidia-hopper-h100``
+            - ``nvidia-hopper-h100-1g20gb``
+            - ``nvidia-hopper-h100-3g40gb``
 
         high_priority (`Optional[bool]`):
             Either ``True`` or ``False``. Enabling this will allow the created
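
With the docstrings updated, the new MIG flavors are requested exactly like any other gpu_type. A minimal sketch, assuming LLM Engine API access is already configured; the model name, shard count, and resource figures are illustrative placeholders, not values from this commit:

from llmengine import Model

# Ask for a single 1g.20gb MIG slice of an H100 rather than a full card.
# Every value besides gpu_type is an illustrative placeholder.
response = Model.create(
    name="llama-2-7b-mig-test",
    model="llama-2-7b",
    inference_framework_image_tag="latest",
    num_shards=1,
    cpus=4,
    memory="16Gi",
    storage="40Gi",
    gpus=1,
    min_workers=0,
    max_workers=1,
    per_worker=10,
    endpoint_type="streaming",
    gpu_type="nvidia-hopper-h100-1g20gb",
)
print(response)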

clients/python/pyproject.toml (+1 -1)
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "scale-llm-engine"
-version = "0.0.0.beta32"
+version = "0.0.0.beta33"
 description = "Scale LLM Engine Python client"
 license = "Apache-2.0"
 authors = ["Phil Chen <phil.chen@scale.com>"]

clients/python/setup.py (+1 -1)
@@ -3,7 +3,7 @@
 setup(
     name="scale-llm-engine",
     python_requires=">=3.7",
-    version="0.0.0.beta32",
+    version="0.0.0.beta33",
     packages=find_packages(),
     package_data={"llmengine": ["py.typed"]},
 )

model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py (+4)
@@ -879,6 +879,10 @@ async def execute(
             max_workers=request.max_workers,
             endpoint_type=request.endpoint_type,
         )
+        if request.gpu_type == GpuType.NVIDIA_AMPERE_A100E:  # pragma: no cover
+            raise ObjectHasInvalidValueException(
+                "We have migrated A100 usage to H100. Please request for H100 instead!"
+            )
         if request.labels is None:
             raise EndpointLabelsException("Endpoint labels cannot be None!")
         validate_labels(request.labels)
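
Because the check sits in the shared execute() path before label validation, A100e endpoint requests now fail fast with a clear error. A self-contained sketch of the guard's behavior; the enum and exception classes here are local stand-ins for the server's own GpuType and ObjectHasInvalidValueException, not imports from this repo:

from enum import Enum


class GpuType(str, Enum):
    NVIDIA_HOPPER_H100 = "nvidia-hopper-h100"
    NVIDIA_AMPERE_A100E = "nvidia-ampere-a100e"


class ObjectHasInvalidValueException(ValueError):
    pass


def validate_gpu_type(gpu_type: GpuType) -> None:
    # Mirrors the new guard: A100e requests are rejected while
    # capacity is migrated to H100s.
    if gpu_type == GpuType.NVIDIA_AMPERE_A100E:
        raise ObjectHasInvalidValueException(
            "We have migrated A100 usage to H100. Please request for H100 instead!"
        )


validate_gpu_type(GpuType.NVIDIA_HOPPER_H100)  # passes silently
try:
    validate_gpu_type(GpuType.NVIDIA_AMPERE_A100E)
except ObjectHasInvalidValueException as e:
    print(e)  # surfaces the migration message to the caller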
